Unverified Commit 102b5ff4 authored by wangpeng's avatar wangpeng Committed by GitHub
Browse files

add new model of MGP-STR (#21418)



* add new model of MGP-STR

* fix the check failings

* remove torch and numpy from mgp_tokenization

* remove unused import from modeling_mgp_str

* add test_processing_mgp_str

* rm test_processing_mgp_str.py

* add test_processing_mgp_str

* add test_processing_mgp_str

* add test_processing_mgp_str

* rm test_processing_mgp_str and add softmax outs to model

* rm test_processing_mgp_str and add softmax outs to model

* rewrite the code of mgp-str according to PR suggestions

* rewrite the code of mgp-str according to PR suggestions

* add new model of MGP-STR

* fix the check failings

* remove torch and numpy from mgp_tokenization

* remove unused import from modeling_mgp_str

* add test_processing_mgp_str

* rm test_processing_mgp_str.py

* add test_processing_mgp_str

* add test_processing_mgp_str

* add test_processing_mgp_str

* rm test_processing_mgp_str and add softmax outs to model

* rewrite the code of mgp-str according to PR suggestions

* rewrite the code of mgp-str according to PR suggestions

* remove representation_size from MGPSTRConfig

* reformat configuration_mgp_str.py

* format test_processor_mgp_str.py

* add test for tokenizer and complete model/processer test and model file

* rm Unnecessary tupple in modeling_mgp_str

* reduce hidden_size/layers/label_size in test_model

* add integration tests and change MGPSTR to Mgpstr

* add test for logit values

* reformat test model file

---------
Co-authored-by: default avataryue kun <yuekun.wp@alibaba-inc.com>
parent 32e3466d
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Processor class for MGP-STR."""
import warnings
from transformers import AutoTokenizer
from transformers.utils import is_torch_available
from transformers.utils.generic import ExplicitEnum
from ...processing_utils import ProcessorMixin
if is_torch_available():
import torch
class DecodeType(ExplicitEnum):
CHARACTER = "char"
BPE = "bpe"
WORDPIECE = "wp"
SUPPORTED_ANNOTATION_FORMATS = (DecodeType.CHARACTER, DecodeType.BPE, DecodeType.WORDPIECE)
class MgpstrProcessor(ProcessorMixin):
r"""
Constructs a MGP-STR processor which wraps an image processor and MGP-STR tokenizers into a single
[`MgpstrProcessor`] offers all the functionalities of `ViTImageProcessor`] and [`MgpstrTokenizer`]. See the
[`~MgpstrProcessor.__call__`] and [`~MgpstrProcessor.batch_decode`] for more information.
Args:
image_processor (`ViTImageProcessor`):
An instance of `ViTImageProcessor`. The image processor is a required input.
tokenizer ([`MgpstrTokenizer`]):
The tokenizer is a required input.
"""
attributes = ["image_processor", "char_tokenizer"]
image_processor_class = "ViTImageProcessor"
char_tokenizer_class = "MgpstrTokenizer"
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
if "feature_extractor" in kwargs:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")
image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
self.char_tokenizer = tokenizer
self.bpe_tokenizer = AutoTokenizer.from_pretrained("gpt2")
self.wp_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
super().__init__(image_processor, tokenizer)
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
"""
When used in normal mode, this method forwards all its arguments to ViTImageProcessor's
[`~ViTImageProcessor.__call__`] and returns its output. This method also forwards the `text` and `kwargs`
arguments to MgpstrTokenizer's [`~MgpstrTokenizer.__call__`] if `text` is not `None` to encode the text. Please
refer to the doctsring of the above methods for more information.
"""
if images is None and text is None:
raise ValueError("You need to specify either an `images` or `text` input to process.")
if images is not None:
inputs = self.image_processor(images, return_tensors=return_tensors, **kwargs)
if text is not None:
encodings = self.char_tokenizer(text, return_tensors=return_tensors, **kwargs)
if text is None:
return inputs
elif images is None:
return encodings
else:
inputs["labels"] = encodings["input_ids"]
return inputs
def batch_decode(self, sequences):
"""
Convert a list of lists of token ids into a list of strings by calling decode.
Args:
sequences (`torch.Tensor`):
List of tokenized input ids.
Returns:
`Dict[str, any]`: Dictionary of all the outputs of the decoded results.
generated_text (`List[str]`): The final results after fusion of char, bpe, and wp. scores
(`List[float]`): The final scores after fusion of char, bpe, and wp. char_preds (`List[str]`): The list
of character decoded sentences. bpe_preds (`List[str]`): The list of bpe decoded sentences. wp_preds
(`List[str]`): The list of wp decoded sentences.
This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
char_preds, bpe_preds, wp_preds = sequences
batch_size = char_preds.size(0)
char_strs, char_scores = self._decode_helper(char_preds, "char")
bpe_strs, bpe_scores = self._decode_helper(bpe_preds, "bpe")
wp_strs, wp_scores = self._decode_helper(wp_preds, "wp")
final_strs = []
final_scores = []
for i in range(batch_size):
scores = [char_scores[i], bpe_scores[i], wp_scores[i]]
strs = [char_strs[i], bpe_strs[i], wp_strs[i]]
max_score_index = scores.index(max(scores))
final_strs.append(strs[max_score_index])
final_scores.append(scores[max_score_index])
out = {}
out["generated_text"] = final_strs
out["scores"] = final_scores
out["char_preds"] = char_strs
out["bpe_preds"] = bpe_strs
out["wp_preds"] = wp_strs
return out
def _decode_helper(self, pred_logits, format):
"""
Convert a list of lists of bpe token ids into a list of strings by calling bpe tokenizer.
Args:
pred_logits (`torch.Tensor`):
List of model prediction logits.
format (`Union[DecoderType, str]`):
Type of model prediction. Must be one of ['char', 'bpe', 'wp'].
Returns:
`tuple`:
dec_strs(`str`): The decode strings of model prediction. conf_scores(`List[float]`): The confidence
score of model prediction.
"""
if format == DecodeType.CHARACTER:
decoder = self.char_decode
eos_token = 1
eos_str = "[s]"
elif format == DecodeType.BPE:
decoder = self.bpe_decode
eos_token = 2
eos_str = "#"
elif format == DecodeType.WORDPIECE:
decoder = self.wp_decode
eos_token = 102
eos_str = "[SEP]"
else:
raise ValueError(f"Format {format} is not supported.")
dec_strs, conf_scores = [], []
batch_size = pred_logits.size(0)
batch_max_length = pred_logits.size(1)
_, preds_index = pred_logits.topk(1, dim=-1, largest=True, sorted=True)
preds_index = preds_index.view(-1, batch_max_length)[:, 1:]
preds_str = decoder(preds_index)
preds_max_prob, _ = torch.nn.functional.softmax(pred_logits, dim=2).max(dim=2)
preds_max_prob = preds_max_prob[:, 1:]
for index in range(batch_size):
pred_eos = preds_str[index].find(eos_str)
pred = preds_str[index][:pred_eos]
pred_index = preds_index[index].cpu().tolist()
pred_eos_index = pred_index.index(eos_token) if eos_token in pred_index else -1
pred_max_prob = preds_max_prob[index][: pred_eos_index + 1]
confidence_score = pred_max_prob.cumprod(dim=0)[-1] if pred_max_prob.nelement() != 0 else 0.0
dec_strs.append(pred)
conf_scores.append(confidence_score)
return dec_strs, conf_scores
def char_decode(self, sequences):
"""
Convert a list of lists of char token ids into a list of strings by calling char tokenizer.
Args:
sequences (`torch.Tensor`):
List of tokenized input ids.
Returns:
`List[str]`: The list of char decoded sentences.
"""
decode_strs = [seq.replace(" ", "") for seq in self.char_tokenizer.batch_decode(sequences)]
return decode_strs
def bpe_decode(self, sequences):
"""
Convert a list of lists of bpe token ids into a list of strings by calling bpe tokenizer.
Args:
sequences (`torch.Tensor`):
List of tokenized input ids.
Returns:
`List[str]`: The list of bpe decoded sentences.
"""
return self.bpe_tokenizer.batch_decode(sequences)
def wp_decode(self, sequences):
"""
Convert a list of lists of word piece token ids into a list of strings by calling word piece tokenizer.
Args:
sequences (`torch.Tensor`):
List of tokenized input ids.
Returns:
`List[str]`: The list of wp decoded sentences.
"""
decode_strs = [seq.replace(" ", "") for seq in self.wp_tokenizer.batch_decode(sequences)]
return decode_strs
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for MGT-STR CHAR."""
import json
import os
from typing import Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"mgp-str": "https://huggingface.co/alibaba-damo/mgp-str-base/blob/main/vocab.json",
}
}
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"mgp-str": 27}
class MgpstrTokenizer(PreTrainedTokenizer):
"""
Construct a MGP-STR char tokenizer.
This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
this superclass for more information regarding those methods.
Args:
vocab_file (`str`):
Path to the vocabulary file.
unk_token (`str`, *optional*, defaults to `"[GO]"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `"[GO]"`):
The beginning of sequence token.
eos_token (`str`, *optional*, defaults to `"[s]"`):
The end of sequence token.
pad_token (`str` or `tokenizers.AddedToken`, *optional*, , defaults to `"[GO]"`):
A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
attention mechanisms or loss computation.
"""
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
def __init__(self, vocab_file, unk_token="[GO]", bos_token="[GO]", eos_token="[s]", pad_token="[GO]", **kwargs):
super().__init__(
unk_token=unk_token,
bos_token=bos_token,
eos_token=eos_token,
pad_token=pad_token,
**kwargs,
)
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.vocab = json.load(vocab_handle)
self.decoder = {v: k for k, v in self.vocab.items()}
@property
def vocab_size(self):
return len(self.vocab)
def get_vocab(self):
return dict(self.vocab, **self.added_tokens_encoder)
def _tokenize(self, text):
"""Tokenize a string."""
char_tokens = []
for s in text:
char_tokens.extend(s)
return char_tokens
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.vocab.get(token, self.vocab.get(self.unk_token))
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.decoder.get(index)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
return
vocab_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
)
with open(vocab_file, "w", encoding="utf-8") as f:
f.write(json.dumps(self.vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
return (vocab_file,)
...@@ -4219,6 +4219,30 @@ class MegatronBertPreTrainedModel(metaclass=DummyObject): ...@@ -4219,6 +4219,30 @@ class MegatronBertPreTrainedModel(metaclass=DummyObject):
requires_backends(self, ["torch"]) requires_backends(self, ["torch"])
MGP_STR_PRETRAINED_MODEL_ARCHIVE_LIST = None
class MgpstrForSceneTextRecognition(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class MgpstrModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class MgpstrPreTrainedModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
class MMBTForClassification(metaclass=DummyObject): class MMBTForClassification(metaclass=DummyObject):
_backends = ["torch"] _backends = ["torch"]
......
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch MGP-STR model. """
import inspect
import unittest
import requests
from transformers import MgpstrConfig
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
from transformers.utils import is_torch_available, is_vision_available
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
if is_torch_available():
import torch
from torch import nn
from transformers import MgpstrForSceneTextRecognition
if is_vision_available():
from PIL import Image
from transformers import MgpstrProcessor
class MgpstrModelTester:
def __init__(
self,
parent,
is_training=False,
batch_size=13,
image_size=(32, 128),
patch_size=4,
num_channels=3,
max_token_length=27,
num_character_labels=38,
num_bpe_labels=99,
num_wordpiece_labels=99,
hidden_size=32,
num_hidden_layers=5,
num_attention_heads=4,
mlp_ratio=4.0,
patch_embeds_hidden_size=257,
output_hidden_states=None,
):
self.parent = parent
self.is_training = is_training
self.batch_size = batch_size
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.max_token_length = max_token_length
self.num_character_labels = num_character_labels
self.num_bpe_labels = num_bpe_labels
self.num_wordpiece_labels = num_wordpiece_labels
self.hidden_size = hidden_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.mlp_ratio = mlp_ratio
self.patch_embeds_hidden_size = patch_embeds_hidden_size
self.output_hidden_states = output_hidden_states
def prepare_config_and_inputs(self):
pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size[0], self.image_size[1]])
config = self.get_config()
return config, pixel_values
def get_config(self):
return MgpstrConfig(
image_size=self.image_size,
patch_size=self.patch_size,
num_channels=self.num_channels,
max_token_length=self.max_token_length,
num_character_labels=self.num_character_labels,
num_bpe_labels=self.num_bpe_labels,
num_wordpiece_labels=self.num_wordpiece_labels,
hidden_size=self.hidden_size,
num_hidden_layers=self.num_hidden_layers,
num_attention_heads=self.num_attention_heads,
mlp_ratio=self.mlp_ratio,
output_hidden_states=self.output_hidden_states,
)
def create_and_check_model(self, config, pixel_values):
model = MgpstrForSceneTextRecognition(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
generated_ids = model(pixel_values)
self.parent.assertEqual(
generated_ids[0][0].shape, (self.batch_size, self.max_token_length, self.num_character_labels)
)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, pixel_values = config_and_inputs
inputs_dict = {"pixel_values": pixel_values}
return config, inputs_dict
@require_torch
class MgpstrModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (MgpstrForSceneTextRecognition,) if is_torch_available() else ()
fx_compatible = False
test_pruning = False
test_resize_embeddings = False
test_head_masking = False
test_attention_outputs = False
def setUp(self):
self.model_tester = MgpstrModelTester(self)
self.config_tester = ConfigTester(self, config_class=MgpstrConfig, has_text_modality=False)
def test_config(self):
self.config_tester.run_common_tests()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
@unittest.skip(reason="MgpstrModel does not use inputs_embeds")
def test_inputs_embeds(self):
pass
def test_model_common_attributes(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
x = model.get_output_embeddings()
self.assertTrue(x is None or isinstance(x, nn.Linear))
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = ["pixel_values"]
self.assertListEqual(arg_names[:1], expected_arg_names)
@unittest.skip(reason="MgpstrModel does not support feedforward chunking")
def test_feed_forward_chunking(self):
pass
def test_gradient_checkpointing_backward_compatibility(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
if not model_class.supports_gradient_checkpointing:
continue
config.gradient_checkpointing = True
model = model_class(config)
self.assertTrue(model.is_gradient_checkpointing)
def test_hidden_states_output(self):
def check_hidden_states_output(inputs_dict, config, model_class):
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(**self._prepare_for_class(inputs_dict, model_class))
hidden_states = outputs.hidden_states
expected_num_layers = getattr(
self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
)
self.assertEqual(len(hidden_states), expected_num_layers)
self.assertListEqual(
list(hidden_states[0].shape[-2:]),
[self.model_tester.patch_embeds_hidden_size, self.model_tester.hidden_size],
)
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
inputs_dict["output_hidden_states"] = True
check_hidden_states_output(inputs_dict, config, model_class)
# check that output_hidden_states also work using config
del inputs_dict["output_hidden_states"]
config.output_hidden_states = True
check_hidden_states_output(inputs_dict, config, model_class)
# override as the `logit_scale` parameter initilization is different for MgpstrModel
def test_initialization(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
configs_no_init = _config_zero_init(config)
for model_class in self.all_model_classes:
model = model_class(config=configs_no_init)
for name, param in model.named_parameters():
if isinstance(param, (nn.Linear, nn.Conv2d, nn.LayerNorm)):
if param.requires_grad:
self.assertIn(
((param.data.mean() * 1e9).round() / 1e9).item(),
[0.0, 1.0],
msg=f"Parameter {name} of model {model_class} seems not properly initialized",
)
@unittest.skip(reason="Retain_grad is tested in individual model tests")
def test_retain_grad_hidden_states_attentions(self):
pass
# We will verify our results on an image from the IIIT-5k dataset
def prepare_img():
url = "https://i.postimg.cc/ZKwLg2Gw/367-14.png"
im = Image.open(requests.get(url, stream=True).raw).convert("RGB")
return im
@require_vision
@require_torch
class MgpstrModelIntegrationTest(unittest.TestCase):
@slow
def test_inference(self):
model_name = "alibaba-damo/mgp-str-base"
model = MgpstrForSceneTextRecognition.from_pretrained(model_name).to(torch_device)
processor = MgpstrProcessor.from_pretrained(model_name)
image = prepare_img()
inputs = processor(images=image, return_tensors="pt").pixel_values.to(torch_device)
# forward pass
with torch.no_grad():
outputs = model(inputs)
# verify the logits
self.assertEqual(outputs.logits[0].shape, torch.Size((1, 27, 38)))
out_strs = processor.batch_decode(outputs.logits)
expected_text = "ticket"
self.assertEqual(out_strs["generated_text"][0], expected_text)
expected_slice = torch.tensor(
[[[-39.7358, -44.8562, -36.6253], [-62.3605, -64.5908, -59.0069], [-74.6127, -68.9724, -71.7150]]],
device=torch_device,
)
self.assertTrue(torch.allclose(outputs.logits[0][:, 1:4, 1:4], expected_slice, atol=1e-4))
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the MgpstrProcessor. """
import json
import os
import shutil
import tempfile
import unittest
import numpy as np
import pytest
from transformers import MgpstrTokenizer
from transformers.models.mgp_str.tokenization_mgp_str import VOCAB_FILES_NAMES
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available
if is_torch_available():
import torch
if is_vision_available():
from PIL import Image
from transformers import MgpstrProcessor, ViTImageProcessor
@require_torch
@require_vision
class MgpstrProcessorTest(unittest.TestCase):
image_processing_class = ViTImageProcessor if is_vision_available() else None
@property
def image_processor_dict(self):
return self.image_processor_tester.prepare_image_processor_dict()
def setUp(self):
self.image_size = (3, 32, 128)
self.tmpdirname = tempfile.mkdtemp()
# fmt: off
vocab = ['[GO]', '[s]', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
# fmt: on
vocab_tokens = dict(zip(vocab, range(len(vocab))))
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
image_processor_map = {
"do_normalize": False,
"do_resize": True,
"feature_extractor_type": "ViTFeatureExtractor",
"resample": 3,
"size": {"height": 32, "width": 128},
}
self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
with open(self.image_processor_file, "w", encoding="utf-8") as fp:
json.dump(image_processor_map, fp)
def get_tokenizer(self, **kwargs):
return MgpstrTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_image_processor(self, **kwargs):
return ViTImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def prepare_image_inputs(self):
"""This function prepares a list of PIL images."""
image_input = np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)
image_input = Image.fromarray(np.moveaxis(image_input, 0, -1))
return image_input
def test_save_load_pretrained_default(self):
tokenizer = self.get_tokenizer()
image_processor = self.get_image_processor()
processor = MgpstrProcessor(tokenizer=tokenizer, image_processor=image_processor)
processor.save_pretrained(self.tmpdirname)
processor = MgpstrProcessor.from_pretrained(self.tmpdirname, use_fast=False)
self.assertEqual(processor.char_tokenizer.get_vocab(), tokenizer.get_vocab())
self.assertIsInstance(processor.char_tokenizer, MgpstrTokenizer)
self.assertEqual(processor.image_processor.to_json_string(), image_processor.to_json_string())
self.assertIsInstance(processor.image_processor, ViTImageProcessor)
def test_save_load_pretrained_additional_features(self):
tokenizer = self.get_tokenizer()
image_processor = self.get_image_processor()
processor = MgpstrProcessor(tokenizer=tokenizer, image_processor=image_processor)
processor.save_pretrained(self.tmpdirname)
tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)")
image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0)
processor = MgpstrProcessor.from_pretrained(
self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0
)
self.assertEqual(processor.char_tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab())
self.assertIsInstance(processor.char_tokenizer, MgpstrTokenizer)
self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string())
self.assertIsInstance(processor.image_processor, ViTImageProcessor)
def test_image_processor(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = MgpstrProcessor(tokenizer=tokenizer, image_processor=image_processor)
image_input = self.prepare_image_inputs()
input_image_proc = image_processor(image_input, return_tensors="np")
input_processor = processor(images=image_input, return_tensors="np")
for key in input_image_proc.keys():
self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2)
def test_tokenizer(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = MgpstrProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "test"
encoded_processor = processor(text=input_str)
encoded_tok = tokenizer(input_str)
for key in encoded_tok.keys():
self.assertListEqual(encoded_tok[key], encoded_processor[key])
def test_processor(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = MgpstrProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = "test"
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input)
self.assertListEqual(list(inputs.keys()), ["pixel_values", "labels"])
# test if it raises when no input is passed
with pytest.raises(ValueError):
processor()
def test_tokenizer_decode(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = MgpstrProcessor(tokenizer=tokenizer, image_processor=image_processor)
predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9], [3, 4, 3, 1, 1, 8, 9]]
decoded_processor = processor.char_decode(predicted_ids)
decoded_tok = tokenizer.batch_decode(predicted_ids)
decode_strs = [seq.replace(" ", "") for seq in decoded_tok]
self.assertListEqual(decode_strs, decoded_processor)
def test_model_input_names(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = MgpstrProcessor(tokenizer=tokenizer, image_processor=image_processor)
input_str = None
image_input = self.prepare_image_inputs()
inputs = processor(text=input_str, images=image_input)
self.assertListEqual(list(inputs.keys()), processor.model_input_names)
def test_processor_batch_decode(self):
image_processor = self.get_image_processor()
tokenizer = self.get_tokenizer()
processor = MgpstrProcessor(tokenizer=tokenizer, image_processor=image_processor)
char_input = torch.randn(1, 27, 38)
bpe_input = torch.randn(1, 27, 50257)
wp_input = torch.randn(1, 27, 30522)
results = processor.batch_decode([char_input, bpe_input, wp_input])
self.assertListEqual(list(results.keys()), ["generated_text", "scores", "char_preds", "bpe_preds", "wp_preds"])
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import unittest
from transformers import MgpstrTokenizer
from transformers.models.mgp_str.tokenization_mgp_str import VOCAB_FILES_NAMES
from transformers.testing_utils import require_tokenizers
from ...test_tokenization_common import TokenizerTesterMixin
@require_tokenizers
class MgpstrTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = MgpstrTokenizer
test_rust_tokenizer = False
from_pretrained_kwargs = {}
test_seq2seq = False
def setUp(self):
super().setUp()
# fmt: off
vocab = ['[GO]', '[s]', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
# fmt: on
vocab_tokens = dict(zip(vocab, range(len(vocab))))
self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
with open(self.vocab_file, "w", encoding="utf-8") as fp:
fp.write(json.dumps(vocab_tokens) + "\n")
def get_tokenizer(self, **kwargs):
return MgpstrTokenizer.from_pretrained(self.tmpdirname, **kwargs)
def get_input_output_texts(self, tokenizer):
input_text = "tester"
output_text = "tester"
return input_text, output_text
@unittest.skip("MGP-STR always lower cases letters.")
def test_added_tokens_do_lower_case(self):
pass
def test_add_special_tokens(self):
tokenizers = self.get_tokenizers(do_lower_case=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
special_token = "[SPECIAL_TOKEN]"
tokenizer.add_special_tokens({"cls_token": special_token})
encoded_special_token = tokenizer.encode([special_token], add_special_tokens=False)
self.assertEqual(len(encoded_special_token), 1)
decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True)
self.assertTrue(special_token not in decoded)
def test_internal_consistency(self):
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
input_text, output_text = self.get_input_output_texts(tokenizer)
tokens = tokenizer.tokenize(input_text)
ids = tokenizer.convert_tokens_to_ids(tokens)
ids_2 = tokenizer.encode(input_text, add_special_tokens=False)
self.assertListEqual(ids, ids_2)
tokens_2 = tokenizer.convert_ids_to_tokens(ids)
self.assertNotEqual(len(tokens_2), 0)
text_2 = tokenizer.decode(ids)
self.assertIsInstance(text_2, str)
self.assertEqual(text_2.replace(" ", ""), output_text)
@unittest.skip("MGP-STR tokenizer only handles one sequence.")
def test_maximum_encoding_length_pair_input(self):
pass
@unittest.skip("inputs cannot be pretokenized in MgpstrTokenizer")
def test_pretokenized_inputs(self):
pass
...@@ -95,6 +95,7 @@ IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [ ...@@ -95,6 +95,7 @@ IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [
"M2M100Encoder", # Building part of bigger (tested) model. "M2M100Encoder", # Building part of bigger (tested) model.
"M2M100Decoder", # Building part of bigger (tested) model. "M2M100Decoder", # Building part of bigger (tested) model.
"MCTCTEncoder", # Building part of bigger (tested) model. "MCTCTEncoder", # Building part of bigger (tested) model.
"MgpstrModel", # Building part of bigger (tested) model.
"Speech2TextEncoder", # Building part of bigger (tested) model. "Speech2TextEncoder", # Building part of bigger (tested) model.
"Speech2TextDecoder", # Building part of bigger (tested) model. "Speech2TextDecoder", # Building part of bigger (tested) model.
"LEDEncoder", # Building part of bigger (tested) model. "LEDEncoder", # Building part of bigger (tested) model.
...@@ -269,6 +270,7 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [ ...@@ -269,6 +270,7 @@ IGNORE_NON_AUTO_CONFIGURED = PRIVATE_MODELS.copy() + [
"LukeForEntityClassification", "LukeForEntityClassification",
"LukeForEntityPairClassification", "LukeForEntityPairClassification",
"LukeForEntitySpanClassification", "LukeForEntitySpanClassification",
"MgpstrModel",
"OpenAIGPTDoubleHeadsModel", "OpenAIGPTDoubleHeadsModel",
"OwlViTTextModel", "OwlViTTextModel",
"OwlViTVisionModel", "OwlViTVisionModel",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment