Commit 31ebceb5 authored by myhloli's avatar myhloli
Browse files

feat(model): add UniMERNet model configuration and processing files

- Add UnimerMBartConfig and UnimerSwinConfig classes
- Implement UnimerSwinImageProcessor for image preprocessing
- Create necessary __init__.py files for module structure
parent 1df26448
from .unimer_swin import UnimerSwinConfig, UnimerSwinModel, UnimerSwinImageProcessor
from .unimer_mbart import UnimerMBartConfig, UnimerMBartModel, UnimerMBartForCausalLM
from .modeling_unimernet import UnimernetModel
# Public names re-exported by this package: the Swin encoder pieces, the
# MBart decoder pieces, and the combined UnimernetModel.
__all__ = [
"UnimerSwinConfig",
"UnimerSwinModel",
"UnimerSwinImageProcessor",
"UnimerMBartConfig",
"UnimerMBartModel",
"UnimerMBartForCausalLM",
"UnimernetModel",
]
import os
import re
import warnings
from typing import Optional
import torch
from ftfy import fix_text
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer, PretrainedConfig, PreTrainedModel
from transformers import VisionEncoderDecoderConfig, VisionEncoderDecoderModel
from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import logger as base_model_logger
from .unimer_swin import UnimerSwinConfig, UnimerSwinModel, UnimerSwinImageProcessor
from .unimer_mbart import UnimerMBartConfig, UnimerMBartForCausalLM
# Register the Unimer config and model classes with the transformers Auto*
# registries at import time: configs are keyed by their `model_type` string,
# models by their config class.
AutoConfig.register(UnimerSwinConfig.model_type, UnimerSwinConfig)
AutoConfig.register(UnimerMBartConfig.model_type, UnimerMBartConfig)
AutoModel.register(UnimerSwinConfig, UnimerSwinModel)
AutoModelForCausalLM.register(UnimerMBartConfig, UnimerMBartForCausalLM)
# TODO: rewrite tokenizer
class TokenizerWrapper:
    """Adapter exposing a small, fixed API on top of a wrapped tokenizer.

    Only these members of the wrapped object are used: it is called directly
    to encode text, and ``batch_decode``, ``convert_ids_to_tokens``,
    ``__len__`` and the ``pad``/``bos``/``eos`` token attributes are read.
    """

    def __init__(self, tokenizer):
        """Keep ``tokenizer`` and mirror its special-token ids on the wrapper."""
        self.tokenizer = tokenizer
        self.pad_token_id = tokenizer.pad_token_id
        self.bos_token_id = tokenizer.bos_token_id
        self.eos_token_id = tokenizer.eos_token_id

    def __len__(self):
        """Return ``len()`` of the wrapped tokenizer."""
        return len(self.tokenizer)

    def tokenize(self, text, **kwargs):
        """Encode ``text`` by calling the wrapped tokenizer.

        The call always requests PyTorch tensors, padding to the longest
        item, truncation, and no token type ids; ``kwargs`` are forwarded
        unchanged alongside those fixed options.
        """
        return self.tokenizer(
            text,
            return_token_type_ids=False,
            return_tensors="pt",
            padding="longest",
            truncation=True,
            **kwargs,
        )

    def token2str(self, tokens) -> list:
        """Decode a batch of id sequences into text cleaned by ``fix_text``.

        Special tokens are skipped during decoding.
        """
        decoded = self.tokenizer.batch_decode(tokens, skip_special_tokens=True)
        return [fix_text(piece) for piece in decoded]

    def detokenize(self, tokens):
        """Turn a batch of id sequences into lists of cleaned token strings.

        For every token: ``None`` becomes ``''``, the ``'Ġ'`` marker is
        replaced by a space, and surrounding whitespace is stripped. Tokens
        equal to the bos/eos/pad token are dropped. Tokens that end up empty
        are kept.
        """
        converted = [self.tokenizer.convert_ids_to_tokens(ids) for ids in tokens]
        batch = []
        for pieces in converted:
            kept = []
            for piece in pieces:
                text = ('' if piece is None else piece).replace('Ġ', ' ').strip()
                if text in [self.tokenizer.bos_token, self.tokenizer.eos_token, self.tokenizer.pad_token]:
                    continue
                kept.append(text)
            batch.append(kept)
        return batch
def latex_rm_whitespace(s: str):
    """Remove unnecessary whitespace from LaTeX code.

    Two stages:

    1. Every ``\\operatorname``/``\\mathrm``/``\\text``/``\\mathbf`` group
       matched by ``text_reg`` has all of its spaces removed.
    2. Whitespace is repeatedly deleted between two non-letters, between a
       non-letter and a letter, and between a letter and a non-letter, until
       a full round changes nothing. Whitespace between two letters is kept,
       and the first two rules never start a match on a ``"\\ "`` sequence.
    """
    text_reg = r'(\\(operatorname|mathrm|text|mathbf)\s?\*? {.*?})'
    letter = r'[a-zA-Z]'
    noletter = r'[\W_^\d]'

    # Group 1 spans the whole match, so stripping its spaces in the callback
    # rewrites each text-like command in place.
    s = re.sub(text_reg, lambda m: m.group(1).replace(' ', ''), s)

    squeeze_rules = (
        re.compile(r'(?!\\ )(%s)\s+?(%s)' % (noletter, noletter)),
        re.compile(r'(?!\\ )(%s)\s+?(%s)' % (noletter, letter)),
        re.compile(r'(%s)\s+?(%s)' % (letter, noletter)),
    )
    while True:
        squeezed = s
        for rule in squeeze_rules:
            squeezed = rule.sub(r'\1\2', squeezed)
        if squeezed == s:
            # Fixed point reached: a full round removed nothing.
            return s
        s = squeezed
class UnimernetModel(VisionEncoderDecoderModel):
    """Vision encoder-decoder model bundled with its image transform and tokenizer.

    On construction the model creates a ``UnimerSwinImageProcessor``
    (``self.transform``) and a ``TokenizerWrapper`` (``self.tokenizer``) loaded
    from ``config._name_or_path``, then checks that the tokenizer matches the
    decoder configuration.
    """

    def __init__(
        self,
        config: Optional[PretrainedConfig] = None,
        encoder: Optional[PreTrainedModel] = None,
        decoder: Optional[PreTrainedModel] = None,
    ):
        """Build the model and attach its preprocessing helpers.

        Args:
            config: Model configuration; must be truthy and carry
                ``_name_or_path``, which is used to load the tokenizer.
            encoder: Optional encoder instance passed to the base class.
            decoder: Optional decoder instance passed to the base class.

        Raises:
            RuntimeError: If ``config`` is missing or has no ``_name_or_path``.
            AssertionError: If the tokenizer does not match the config
                (see ``_post_check``).
        """
        # VisionEncoderDecoderModel's checking log has bug, disable for temp.
        base_model_logger.disabled = True
        try:
            super().__init__(config, encoder, decoder)
        finally:
            # Always re-enable the shared logger, even if construction failed.
            base_model_logger.disabled = False

        if not config or not hasattr(config, "_name_or_path"):
            raise RuntimeError("config._name_or_path is required by UnimernetModel.")

        model_path = config._name_or_path
        self.transform = UnimerSwinImageProcessor()
        self.tokenizer = TokenizerWrapper(AutoTokenizer.from_pretrained(model_path))
        self._post_check()

    def _post_check(self):
        """Reconcile and validate the tokenizer against the model config.

        If the tokenizer's ``model_max_length`` differs from the decoder's
        ``max_position_embeddings``, a warning is issued and the tokenizer is
        updated. Vocabulary size, decoder start token and pad token must match.

        Raises:
            AssertionError: On any vocabulary/special-token mismatch. Raised
                explicitly (not via ``assert``) so the checks still run under
                ``python -O``.
        """
        tokenizer = self.tokenizer
        decoder_config = self.config.decoder
        max_positions = decoder_config.max_position_embeddings

        if tokenizer.tokenizer.model_max_length != max_positions:
            warnings.warn(
                f"decoder.max_position_embeddings={max_positions},"
                f" but tokenizer.model_max_length={tokenizer.tokenizer.model_max_length}, will set"
                f" tokenizer.model_max_length to {max_positions}.")
            tokenizer.tokenizer.model_max_length = max_positions

        if decoder_config.vocab_size != len(tokenizer):
            raise AssertionError(
                f"decoder.vocab_size={decoder_config.vocab_size} does not match"
                f" tokenizer size {len(tokenizer)}.")
        if self.config.decoder_start_token_id != tokenizer.bos_token_id:
            raise AssertionError(
                f"decoder_start_token_id={self.config.decoder_start_token_id} does not match"
                f" tokenizer.bos_token_id={tokenizer.bos_token_id}.")
        if self.config.pad_token_id != tokenizer.pad_token_id:
            raise AssertionError(
                f"pad_token_id={self.config.pad_token_id} does not match"
                f" tokenizer.pad_token_id={tokenizer.pad_token_id}.")

    @classmethod
    def from_checkpoint(cls, model_path: str, model_filename: str = "pytorch_model.pth", state_dict_strip_prefix="model.model."):
        """Create a model from a config directory plus a raw PyTorch checkpoint.

        Args:
            model_path: Directory holding the config, the tokenizer files and
                the checkpoint file.
            model_filename: Checkpoint file name inside ``model_path``.
            state_dict_strip_prefix: Prefix removed from every state-dict key
                that starts with it; a falsy value disables stripping.

        Returns:
            The model with the checkpoint weights loaded.

        Raises:
            RuntimeError: If the state dict is empty or the checkpoint lacks
                keys the model requires.
        """
        config = VisionEncoderDecoderConfig.from_pretrained(model_path)
        config._name_or_path = model_path
        # Re-wrap the generic sub-configs in the Unimer-specific classes.
        config.encoder = UnimerSwinConfig(**vars(config.encoder))
        config.decoder = UnimerMBartConfig(**vars(config.decoder))

        encoder = UnimerSwinModel(config.encoder)
        decoder = UnimerMBartForCausalLM(config.decoder)
        model = cls(config, encoder, decoder)

        # load model weights
        model_file_path = os.path.join(model_path, model_filename)
        checkpoint = torch.load(model_file_path, map_location="cpu", weights_only=True)
        # The weights may sit under a top-level "model" key or be the checkpoint itself.
        state_dict = checkpoint["model"] if "model" in checkpoint else checkpoint
        if not state_dict:
            raise RuntimeError("state_dict is empty.")
        if state_dict_strip_prefix:
            state_dict = {
                k[len(state_dict_strip_prefix):] if k.startswith(state_dict_strip_prefix) else k: v
                for k, v in state_dict.items()
            }

        # Load non-strictly so that extra keys only warn; missing keys are fatal.
        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
        if len(unexpected_keys) > 0:
            warnings.warn("Unexpected key(s) in state_dict: {}.".format(", ".join(f'"{k}"' for k in unexpected_keys)))
        if len(missing_keys) > 0:
            raise RuntimeError("Missing key(s) in state_dict: {}.".format(", ".join(f'"{k}"' for k in missing_keys)))
        return model

    def forward_bak(self, samples):
        """Compute the training loss for a batch (kept as a backup method).

        Args:
            samples: Mapping with ``"image"`` (pixel tensor) and
                ``"text_input"`` (target text).

        Returns:
            ``{"loss": loss}``.

        NOTE(review): this method calls ``self.model``, but nothing in this
        class assigns a ``model`` attribute -- confirm it exists before relying
        on this method.
        """
        pixel_values, text = samples["image"], samples["text_input"]

        text_inputs = self.tokenizer.tokenize(text).to(pixel_values.device)
        decoder_input_ids, decoder_attention_mask = text_inputs["input_ids"], text_inputs["attention_mask"]

        # Single-channel input is repeated along dim 1 to three channels.
        num_channels = pixel_values.shape[1]
        if num_channels == 1:
            pixel_values = pixel_values.repeat(1, 3, 1, 1)

        # "* 1" makes a copy so masking the labels leaves the input ids intact.
        labels = decoder_input_ids * 1
        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)

        # Inputs drop the last position and labels drop the first, so each
        # position is trained to predict the following token.
        loss = self.model(
            pixel_values=pixel_values,
            decoder_input_ids=decoder_input_ids[:, :-1],
            decoder_attention_mask=decoder_attention_mask[:, :-1],
            labels=labels[:, 1:],
        ).loss
        return {"loss": loss}

    def generate(self, samples, do_sample: bool = False, temperature: float = 0.2, top_p: float = 0.95):
        """Generate token sequences for a batch of images and decode them.

        Args:
            samples: Mapping with ``"image"`` (pixel tensor).
            do_sample: Whether to sample instead of decoding deterministically.
            temperature: Sampling temperature; only forwarded when
                ``do_sample`` is true.
            top_p: Nucleus sampling threshold; only forwarded when
                ``do_sample`` is true.

        Returns:
            Dict with ``"pred_ids"`` (generated ids as a NumPy array, first
            column removed), ``"pred_tokens"`` (cleaned token strings),
            ``"pred_str"`` (decoded text) and ``"fixed_str"`` (decoded text
            after ``latex_rm_whitespace``).
        """
        pixel_values = samples["image"]
        # Single-channel input is repeated along dim 1 to three channels.
        num_channels = pixel_values.shape[1]
        if num_channels == 1:
            pixel_values = pixel_values.repeat(1, 3, 1, 1)

        # Sampling parameters are only passed when sampling is enabled.
        kwargs = {}
        if do_sample:
            kwargs["temperature"] = temperature
            kwargs["top_p"] = top_p

        outputs = super().generate(
            pixel_values=pixel_values,
            max_new_tokens=self.tokenizer.tokenizer.model_max_length,  # required
            decoder_start_token_id=self.tokenizer.tokenizer.bos_token_id,
            do_sample=do_sample,
            **kwargs,
        )

        # Drop the first column (the decoder start token passed above).
        outputs = outputs[:, 1:].cpu().numpy()
        pred_tokens = self.tokenizer.detokenize(outputs)
        pred_str = self.tokenizer.token2str(outputs)
        fixed_str = [latex_rm_whitespace(s) for s in pred_str]
        return {"pred_ids": outputs, "pred_tokens": pred_tokens, "pred_str": pred_str, "fixed_str": fixed_str}
from .configuration_unimer_mbart import UnimerMBartConfig
from .modeling_unimer_mbart import UnimerMBartModel, UnimerMBartForCausalLM
# Public names re-exported by the unimer_mbart sub-package.
__all__ = [
"UnimerMBartConfig",
"UnimerMBartModel",
"UnimerMBartForCausalLM",
]
# coding=utf-8
# Copyright 2021, The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""UnimerMBART model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
# Module-level logger obtained from the transformers logging utilities.
logger = logging.get_logger(__name__)
class UnimerMBartConfig(PretrainedConfig):
    r"""
    Configuration for the UnimerMBart model family (``model_type = "unimer-mbart"``).

    It follows the MBART configuration layout (see
    [facebook/mbart-large-cc25](https://huggingface.co/facebook/mbart-large-cc25)) and adds the `qk_squeeze`
    option. Configuration objects inherit from [`PretrainedConfig`]; read the documentation from
    [`PretrainedConfig`] for the options handled by the base class.

    Args:
        vocab_size (`int`, *optional*, defaults to 50265):
            Vocabulary size of the model, i.e. the number of different tokens that `input_ids` can represent.
        max_position_embeddings (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with.
        encoder_layers (`int`, *optional*, defaults to 12):
            Number of encoder layers. Also stored as `num_hidden_layers`.
        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
        encoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_layers (`int`, *optional*, defaults to 12):
            Number of decoder layers.
        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
        decoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the encoder. See the
            [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the decoder. See the
            [LayerDrop paper](https://arxiv.org/abs/1909.11556) for more details.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Forwarded to [`PretrainedConfig`].
        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string). If string, `"gelu"`, `"relu"`, `"silu"`
            and `"gelu_new"` are supported.
        d_model (`int`, *optional*, defaults to 1024):
            Dimensionality of the layers and the pooler layer. Exposed as `hidden_size` through `attribute_map`.
        qk_squeeze (`int`, *optional*, defaults to 2):
            Squeeze ratio for query/key's output dimension. See the
            [UniMERNet paper](https://arxiv.org/abs/2404.15254). Squeeze Attention maps the query and key to a
            lower-dimensional space without excessive loss of information, thereby accelerating the computation
            of attention.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        scale_embedding (`bool`, *optional*, defaults to `False`):
            Whether to scale embeddings; the scale factor is sqrt(d_model) when enabled.
        pad_token_id (`int`, *optional*, defaults to 1):
            Forwarded to [`PretrainedConfig`].
        bos_token_id (`int`, *optional*, defaults to 0):
            Forwarded to [`PretrainedConfig`].
        eos_token_id (`int`, *optional*, defaults to 2):
            Forwarded to [`PretrainedConfig`].
        forced_eos_token_id (`int`, *optional*, defaults to 2):
            The id of the token to force as the last generated token when `max_length` is reached. Usually set
            to `eos_token_id`.

    Example:

    ```python
    >>> # Initializing a configuration with the default values
    >>> configuration = UnimerMBartConfig()

    >>> # Reading back a stored hyper-parameter
    >>> configuration.qk_squeeze
    2
    ```"""

    model_type = "unimer-mbart"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=50265,
        max_position_embeddings=1024,
        encoder_layers=12,
        encoder_ffn_dim=4096,
        encoder_attention_heads=16,
        decoder_layers=12,
        decoder_ffn_dim=4096,
        decoder_attention_heads=16,
        encoder_layerdrop=0.0,
        decoder_layerdrop=0.0,
        use_cache=True,
        is_encoder_decoder=True,
        activation_function="gelu",
        d_model=1024,
        qk_squeeze=2,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        classifier_dropout=0.0,
        scale_embedding=False,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        forced_eos_token_id=2,
        **kwargs,
    ):
        # Overall sizes.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.qk_squeeze = qk_squeeze

        # Encoder stack; the generic layer count mirrors the encoder depth.
        self.encoder_layers = encoder_layers
        self.num_hidden_layers = encoder_layers
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_attention_heads = encoder_attention_heads
        self.encoder_layerdrop = encoder_layerdrop

        # Decoder stack.
        self.decoder_layers = decoder_layers
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_attention_heads = decoder_attention_heads
        self.decoder_layerdrop = decoder_layerdrop

        # Activation, regularisation and initialisation.
        self.activation_function = activation_function
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.classifier_dropout = classifier_dropout
        self.init_std = init_std

        # Runtime behaviour.
        self.use_cache = use_cache
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

        # Special-token ids and any remaining options go to the base class.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            forced_eos_token_id=forced_eos_token_id,
            **kwargs,
        )
from .configuration_unimer_swin import UnimerSwinConfig
from .modeling_unimer_swin import UnimerSwinModel
from .image_processing_unimer_swin import UnimerSwinImageProcessor
# Public names re-exported by the unimer_swin sub-package.
__all__ = [
"UnimerSwinConfig",
"UnimerSwinModel",
"UnimerSwinImageProcessor",
]
# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Donut Swin Transformer model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
# Module-level logger obtained from the transformers logging utilities.
logger = logging.get_logger(__name__)
class UnimerSwinConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`UnimerSwinModel`]
    (``model_type = "unimer-swin"``). The defaults follow the Donut
    [naver-clova-ix/donut-base](https://huggingface.co/naver-clova-ix/donut-base) Swin encoder layout.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 4):
            The size (resolution) of each patch.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        embed_dim (`int`, *optional*, defaults to 96):
            Dimensionality of patch embedding.
        depths (sequence of `int`, *optional*, defaults to `(2, 2, 6, 2)`):
            Depth of each layer in the Transformer encoder. Stored as a new list; its length is stored as
            `num_layers`.
        num_heads (sequence of `int`, *optional*, defaults to `(3, 6, 12, 24)`):
            Number of attention heads in each layer of the Transformer encoder. Stored as a new list.
        window_size (`int`, *optional*, defaults to 7):
            Size of windows.
        mlp_ratio (`float`, *optional*, defaults to 4.0):
            Ratio of MLP hidden dimensionality to embedding dimensionality.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether or not a learnable bias should be added to the queries, keys and values.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings and encoder.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        drop_path_rate (`float`, *optional*, defaults to 0.1):
            Stochastic depth rate.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
            `"selu"` and `"gelu_new"` are supported.
        use_absolute_embeddings (`bool`, *optional*, defaults to `False`):
            Whether or not to add absolute position embeddings to the patch embeddings.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.

    Example:

    ```python
    >>> # Initializing a configuration with the default values
    >>> configuration = UnimerSwinConfig()

    >>> # Channel dimension after the last stage: embed_dim * 2 ** (len(depths) - 1)
    >>> configuration.hidden_size
    768
    ```"""

    model_type = "unimer-swin"

    attribute_map = {
        "num_attention_heads": "num_heads",
        "num_hidden_layers": "num_layers",
    }

    def __init__(
        self,
        image_size=224,
        patch_size=4,
        num_channels=3,
        embed_dim=96,
        depths=(2, 2, 6, 2),
        num_heads=(3, 6, 12, 24),
        window_size=7,
        mlp_ratio=4.0,
        qkv_bias=True,
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        drop_path_rate=0.1,
        hidden_act="gelu",
        use_absolute_embeddings=False,
        initializer_range=0.02,
        layer_norm_eps=1e-5,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.embed_dim = embed_dim
        # Copy the per-stage sequences: each config then owns its own list, so
        # mutating one instance can never leak into the defaults or into the
        # caller's object.
        self.depths = list(depths)
        self.num_layers = len(depths)
        self.num_heads = list(num_heads)
        self.window_size = window_size
        self.mlp_ratio = mlp_ratio
        self.qkv_bias = qkv_bias
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.drop_path_rate = drop_path_rate
        self.hidden_act = hidden_act
        self.use_absolute_embeddings = use_absolute_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range
        # we set the hidden_size attribute in order to make Swin work with VisionEncoderDecoderModel
        # this indicates the channel dimension after the last stage of the model
        self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
from transformers.image_processing_utils import BaseImageProcessor
from PIL import Image, ImageOps
import numpy as np
import cv2
import albumentations as alb
from albumentations.pytorch import ToTensorV2
from torchvision.transforms.functional import resize
# TODO: dereference cv2 if possible
class UnimerSwinImageProcessor(BaseImageProcessor):
    """Image preprocessing pipeline: crop margins, resize, pad, then normalise.

    Calling the processor on a PIL image returns the first channel of the
    tensor produced by the albumentations pipeline in ``self.transform``.
    """

    def __init__(
        self,
        image_size = [192, 672],
    ):
        """Store the target canvas size and build the tensor transform.

        Args:
            image_size: Two values, used as ``(height, width)`` of the padded
                canvas; each is converted with ``int``.
        """
        self.input_size = list(map(int, image_size))
        assert len(self.input_size) == 2

        pipeline = [
            alb.ToGray(always_apply=True),
            alb.Normalize((0.7931, 0.7931, 0.7931), (0.1738, 0.1738, 0.1738)),
            # alb.Sharpen()
            ToTensorV2(),
        ]
        self.transform = alb.Compose(pipeline)

    def __call__(self, item):
        """Preprocess ``item`` and return the first channel of the result.

        NOTE(review): ``prepare_input`` can return ``None`` (missing, unreadable
        or empty image); that value is passed on to the transform unchanged.
        """
        prepared = np.array(self.prepare_input(item))
        tensor = self.transform(image=prepared)['image']
        return tensor[:1]

    @staticmethod
    def crop_margin(img: Image.Image) -> Image.Image:
        """Crop ``img`` to the bounding box of its darker pixels.

        The image is converted to greyscale and rescaled to the 0-255 range;
        pixels below 200 after rescaling count as content. A uniform image
        (min equals max) is returned unchanged.
        """
        data = np.array(img.convert("L")).astype(np.uint8)
        lowest, highest = data.min(), data.max()
        if highest == lowest:
            return img

        # Stretch contrast so the fixed threshold works for any brightness range.
        data = (data - lowest) / (highest - lowest) * 255
        content_mask = 255 * (data < 200).astype(np.uint8)

        points = cv2.findNonZero(content_mask)  # Find all non-zero points (text)
        left, top, width, height = cv2.boundingRect(points)  # Find minimum spanning bounding box
        return img.crop((left, top, left + width, top + height))

    def prepare_input(self, img: Image.Image, random_padding: bool = False):
        """
        Fit a PIL image onto the ``input_size`` canvas with the steps below:
            - crop margins (after converting to RGB)
            - resize with torchvision ``resize`` using ``min(input_size)``
            - shrink in place with ``thumbnail`` to at most (width, height)
            - pad to exactly ``input_size``

        Args:
            img: Source image, or ``None``.
            random_padding: If true, the left/top padding is drawn at random
                from the available slack; otherwise the slack is split with
                ``// 2`` on the left/top and the remainder on the right/bottom.

        Returns:
            The padded PIL image, or ``None`` when ``img`` is ``None``, when
            cropping raises ``OSError``, or when the cropped image has a zero
            dimension.
        """
        if img is None:
            return None

        # crop margins
        try:
            img = self.crop_margin(img.convert("RGB"))
        except OSError:
            # might throw an error for broken files
            return None

        if img.height == 0 or img.width == 0:
            return None

        target_height = self.input_size[0]
        target_width = self.input_size[1]

        img = resize(img, min(self.input_size))
        img.thumbnail((target_width, target_height))

        # Slack left on each axis once the image fits inside the canvas.
        delta_width = target_width - img.width
        delta_height = target_height - img.height

        if random_padding:
            pad_width = np.random.randint(low=0, high=delta_width + 1)
            pad_height = np.random.randint(low=0, high=delta_height + 1)
        else:
            pad_width = delta_width // 2
            pad_height = delta_height // 2

        # Border order is (left, top, right, bottom).
        border = (
            pad_width,
            pad_height,
            delta_width - pad_width,
            delta_height - pad_height,
        )
        return ImageOps.expand(img, border)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment