Commit bd927919 authored by myhloli's avatar myhloli
Browse files

refactor: rename init file and update app.py to enable parsing method

parent f5016508
# coding=utf-8
from transformers.models.bert.configuration_bert import BertConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"layoutlmv3-base": "https://huggingface.co/microsoft/layoutlmv3-base/resolve/main/config.json",
"layoutlmv3-large": "https://huggingface.co/microsoft/layoutlmv3-large/resolve/main/config.json",
# See all LayoutLMv3 models at https://huggingface.co/models?filter=layoutlmv3
}
class LayoutLMv3Config(BertConfig):
model_type = "layoutlmv3"
def __init__(
self,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
max_2d_position_embeddings=1024,
coordinate_size=None,
shape_size=None,
has_relative_attention_bias=False,
rel_pos_bins=32,
max_rel_pos=128,
has_spatial_attention_bias=False,
rel_2d_pos_bins=64,
max_rel_2d_pos=256,
visual_embed=True,
mim=False,
wpa_task=False,
discrete_vae_weight_path='',
discrete_vae_type='dall-e',
input_size=224,
second_input_size=112,
device='cuda',
**kwargs
):
"""Constructs RobertaConfig."""
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
self.max_2d_position_embeddings = max_2d_position_embeddings
self.coordinate_size = coordinate_size
self.shape_size = shape_size
self.has_relative_attention_bias = has_relative_attention_bias
self.rel_pos_bins = rel_pos_bins
self.max_rel_pos = max_rel_pos
self.has_spatial_attention_bias = has_spatial_attention_bias
self.rel_2d_pos_bins = rel_2d_pos_bins
self.max_rel_2d_pos = max_rel_2d_pos
self.visual_embed = visual_embed
self.mim = mim
self.wpa_task = wpa_task
self.discrete_vae_weight_path = discrete_vae_weight_path
self.discrete_vae_type = discrete_vae_type
self.input_size = input_size
self.second_input_size = second_input_size
self.device = device
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch LayoutLMv3 model. """
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers import apply_chunking_to_forward
from transformers.modeling_outputs import (
BaseModelOutputWithPastAndCrossAttentions,
BaseModelOutputWithPoolingAndCrossAttentions,
MaskedLMOutput,
TokenClassifierOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
)
from transformers.modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
from transformers.models.roberta.modeling_roberta import (
RobertaIntermediate,
RobertaLMHead,
RobertaOutput,
RobertaSelfOutput,
)
from transformers.utils import logging
from .configuration_layoutlmv3 import LayoutLMv3Config
from timm.models.layers import to_2tuple
logger = logging.get_logger(__name__)
class PatchEmbed(nn.Module):
""" Image to Patch Embedding
"""
def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
# The following variables are used in detection mycheckpointer.py
self.num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
self.num_patches_w = self.patch_shape[0]
self.num_patches_h = self.patch_shape[1]
def forward(self, x, position_embedding=None):
x = self.proj(x)
if position_embedding is not None:
# interpolate the position embedding to the corresponding size
position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1).permute(0, 3, 1, 2)
Hp, Wp = x.shape[2], x.shape[3]
position_embedding = F.interpolate(position_embedding, size=(Hp, Wp), mode='bicubic')
x = x + position_embedding
x = x.flatten(2).transpose(1, 2)
return x
class LayoutLMv3Embeddings(nn.Module):
"""
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
"""
# Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
def __init__(self, config):
super().__init__()
self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
# position_ids (1, len position emb) is contiguous in memory and exported when serialized
self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
# End copy
self.padding_idx = config.pad_token_id
self.position_embeddings = nn.Embedding(
config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
)
self.x_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size)
self.y_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size)
self.h_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size)
self.w_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size)
def _calc_spatial_position_embeddings(self, bbox):
try:
assert torch.all(0 <= bbox) and torch.all(bbox <= 1023)
left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0])
upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1])
right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2])
lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3])
except IndexError as e:
raise IndexError("The :obj:`bbox` coordinate values should be within 0-1000 range.") from e
h_position_embeddings = self.h_position_embeddings(torch.clip(bbox[:, :, 3] - bbox[:, :, 1], 0, 1023))
w_position_embeddings = self.w_position_embeddings(torch.clip(bbox[:, :, 2] - bbox[:, :, 0], 0, 1023))
# below is the difference between LayoutLMEmbeddingsV2 (torch.cat) and LayoutLMEmbeddingsV1 (add)
spatial_position_embeddings = torch.cat(
[
left_position_embeddings,
upper_position_embeddings,
right_position_embeddings,
lower_position_embeddings,
h_position_embeddings,
w_position_embeddings,
],
dim=-1,
)
return spatial_position_embeddings
def create_position_ids_from_input_ids(self, input_ids, padding_idx, past_key_values_length=0):
"""
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.
Args:
x: torch.Tensor x:
Returns: torch.Tensor
"""
# The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
mask = input_ids.ne(padding_idx).int()
incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
return incremental_indices.long() + padding_idx
def forward(
self,
input_ids=None,
bbox=None,
token_type_ids=None,
position_ids=None,
inputs_embeds=None,
past_key_values_length=0,
):
if position_ids is None:
if input_ids is not None:
# Create the position ids from the input token ids. Any padded tokens remain padded.
position_ids = self.create_position_ids_from_input_ids(
input_ids, self.padding_idx, past_key_values_length).to(input_ids.device)
else:
position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
if input_ids is not None:
input_shape = input_ids.size()
else:
input_shape = inputs_embeds.size()[:-1]
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + token_type_embeddings
position_embeddings = self.position_embeddings(position_ids)
embeddings += position_embeddings
spatial_position_embeddings = self._calc_spatial_position_embeddings(bbox)
embeddings = embeddings + spatial_position_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
def create_position_ids_from_inputs_embeds(self, inputs_embeds):
"""
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
Args:
inputs_embeds: torch.Tensor≈
Returns: torch.Tensor
"""
input_shape = inputs_embeds.size()[:-1]
sequence_length = input_shape[1]
position_ids = torch.arange(
self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
)
return position_ids.unsqueeze(0).expand(input_shape)
class LayoutLMv3PreTrainedModel(PreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
config_class = LayoutLMv3Config
base_model_prefix = "layoutlmv3"
# Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, nn.Linear):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
elif isinstance(module, nn.LayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
class LayoutLMv3SelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
raise ValueError(
f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
f"heads ({config.num_attention_heads})"
)
self.num_attention_heads = config.num_attention_heads
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size)
self.key = nn.Linear(config.hidden_size, self.all_head_size)
self.value = nn.Linear(config.hidden_size, self.all_head_size)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
self.has_relative_attention_bias = config.has_relative_attention_bias
self.has_spatial_attention_bias = config.has_spatial_attention_bias
def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)
def cogview_attn(self, attention_scores, alpha=32):
'''
https://arxiv.org/pdf/2105.13290.pdf
Section 2.4 Stabilization of training: Precision Bottleneck Relaxation (PB-Relax).
A replacement of the original nn.Softmax(dim=-1)(attention_scores)
Seems the new attention_probs will result in a slower speed and a little bias
Can use torch.allclose(standard_attention_probs, cogview_attention_probs, atol=1e-08) for comparison
The smaller atol (e.g., 1e-08), the better.
'''
scaled_attention_scores = attention_scores / alpha
max_value = scaled_attention_scores.amax(dim=(-1)).unsqueeze(-1)
# max_value = scaled_attention_scores.amax(dim=(-2, -1)).unsqueeze(-1).unsqueeze(-1)
new_attention_scores = (scaled_attention_scores - max_value) * alpha
return nn.Softmax(dim=-1)(new_attention_scores)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
rel_pos=None,
rel_2d_pos=None,
):
mixed_query_layer = self.query(hidden_states)
# If this is instantiated as a cross-attention module, the keys
# and values come from an encoder; the attention mask needs to be
# such that the encoder's padding tokens are not attended to.
is_cross_attention = encoder_hidden_states is not None
if is_cross_attention and past_key_value is not None:
# reuse k,v, cross_attentions
key_layer = past_key_value[0]
value_layer = past_key_value[1]
attention_mask = encoder_attention_mask
elif is_cross_attention:
key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
attention_mask = encoder_attention_mask
elif past_key_value is not None:
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
else:
key_layer = self.transpose_for_scores(self.key(hidden_states))
value_layer = self.transpose_for_scores(self.value(hidden_states))
query_layer = self.transpose_for_scores(mixed_query_layer)
# Take the dot product between "query" and "key" to get the raw attention scores.
# The attention scores QT K/√d could be significantly larger than input elements, and result in overflow.
# Changing the computational order into QT(K/√d) alleviates the problem. (https://arxiv.org/pdf/2105.13290.pdf)
attention_scores = torch.matmul(query_layer / math.sqrt(self.attention_head_size), key_layer.transpose(-1, -2))
if self.has_relative_attention_bias and self.has_spatial_attention_bias:
attention_scores += (rel_pos + rel_2d_pos) / math.sqrt(self.attention_head_size)
elif self.has_relative_attention_bias:
attention_scores += rel_pos / math.sqrt(self.attention_head_size)
# if self.has_relative_attention_bias:
# attention_scores += rel_pos
# if self.has_spatial_attention_bias:
# attention_scores += rel_2d_pos
# attention_scores = attention_scores / math.sqrt(self.attention_head_size)
if attention_mask is not None:
# Apply the attention mask is (precomputed for all layers in RobertaModel forward() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
# attention_probs = nn.Softmax(dim=-1)(attention_scores) # comment the line below and use this line for speedup
attention_probs = self.cogview_attn(attention_scores) # to stablize training
# assert torch.allclose(attention_probs, nn.Softmax(dim=-1)(attention_scores), atol=1e-8)
# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)
# Mask heads if we want to
if head_mask is not None:
attention_probs = attention_probs * head_mask
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
return outputs
class LayoutLMv3Attention(nn.Module):
def __init__(self, config):
super().__init__()
self.self = LayoutLMv3SelfAttention(config)
self.output = RobertaSelfOutput(config)
self.pruned_heads = set()
def prune_heads(self, heads):
if len(heads) == 0:
return
heads, index = find_pruneable_heads_and_indices(
heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
)
# Prune linear layers
self.self.query = prune_linear_layer(self.self.query, index)
self.self.key = prune_linear_layer(self.self.key, index)
self.self.value = prune_linear_layer(self.self.value, index)
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
# Update hyper params and store pruned heads
self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
self.pruned_heads = self.pruned_heads.union(heads)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
rel_pos=None,
rel_2d_pos=None,
):
self_outputs = self.self(
hidden_states,
attention_mask,
head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
rel_pos=rel_pos,
rel_2d_pos=rel_2d_pos,
)
attention_output = self.output(self_outputs[0], hidden_states)
outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
return outputs
class LayoutLMv3Layer(nn.Module):
def __init__(self, config):
super().__init__()
self.chunk_size_feed_forward = config.chunk_size_feed_forward
self.seq_len_dim = 1
self.attention = LayoutLMv3Attention(config)
assert not config.is_decoder and not config.add_cross_attention, \
"This version do not support decoder. Please refer to RoBERTa for implementation of is_decoder."
self.intermediate = RobertaIntermediate(config)
self.output = RobertaOutput(config)
def forward(
self,
hidden_states,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_value=None,
output_attentions=False,
rel_pos=None,
rel_2d_pos=None,
):
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
self_attention_outputs = self.attention(
hidden_states,
attention_mask,
head_mask,
output_attentions=output_attentions,
past_key_value=self_attn_past_key_value,
rel_pos=rel_pos,
rel_2d_pos=rel_2d_pos,
)
attention_output = self_attention_outputs[0]
outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
layer_output = apply_chunking_to_forward(
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
)
outputs = (layer_output,) + outputs
return outputs
def feed_forward_chunk(self, attention_output):
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output
class LayoutLMv3Encoder(nn.Module):
def __init__(self, config, detection=False, out_features=None):
super().__init__()
self.config = config
self.detection = detection
self.layer = nn.ModuleList([LayoutLMv3Layer(config) for _ in range(config.num_hidden_layers)])
self.gradient_checkpointing = False
self.has_relative_attention_bias = config.has_relative_attention_bias
self.has_spatial_attention_bias = config.has_spatial_attention_bias
if self.has_relative_attention_bias:
self.rel_pos_bins = config.rel_pos_bins
self.max_rel_pos = config.max_rel_pos
self.rel_pos_onehot_size = config.rel_pos_bins
self.rel_pos_bias = nn.Linear(self.rel_pos_onehot_size, config.num_attention_heads, bias=False)
if self.has_spatial_attention_bias:
self.max_rel_2d_pos = config.max_rel_2d_pos
self.rel_2d_pos_bins = config.rel_2d_pos_bins
self.rel_2d_pos_onehot_size = config.rel_2d_pos_bins
self.rel_pos_x_bias = nn.Linear(self.rel_2d_pos_onehot_size, config.num_attention_heads, bias=False)
self.rel_pos_y_bias = nn.Linear(self.rel_2d_pos_onehot_size, config.num_attention_heads, bias=False)
if self.detection:
self.gradient_checkpointing = True
embed_dim = self.config.hidden_size
self.out_features = out_features
self.out_indices = [int(name[5:]) for name in out_features]
self.fpn1 = nn.Sequential(
nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
# nn.SyncBatchNorm(embed_dim),
nn.BatchNorm2d(embed_dim),
nn.GELU(),
nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
)
self.fpn2 = nn.Sequential(
nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
)
self.fpn3 = nn.Identity()
self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)
self.ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
def relative_position_bucket(self, relative_position, bidirectional=True, num_buckets=32, max_distance=128):
ret = 0
if bidirectional:
num_buckets //= 2
ret += (relative_position > 0).long() * num_buckets
n = torch.abs(relative_position)
else:
n = torch.max(-relative_position, torch.zeros_like(relative_position))
# now n is in the range [0, inf)
# half of the buckets are for exact increments in positions
max_exact = num_buckets // 2
is_small = n < max_exact
# The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
val_if_large = max_exact + (
torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
).to(torch.long)
val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
ret += torch.where(is_small, n, val_if_large)
return ret
def _cal_1d_pos_emb(self, hidden_states, position_ids, valid_span):
VISUAL_NUM = 196 + 1
rel_pos_mat = position_ids.unsqueeze(-2) - position_ids.unsqueeze(-1)
if valid_span is not None:
# for the text part, if two words are not in the same line,
# set their distance to the max value (position_ids.shape[-1])
rel_pos_mat[(rel_pos_mat > 0) & (valid_span == False)] = position_ids.shape[1]
rel_pos_mat[(rel_pos_mat < 0) & (valid_span == False)] = -position_ids.shape[1]
# image-text, minimum distance
rel_pos_mat[:, -VISUAL_NUM:, :-VISUAL_NUM] = 0
rel_pos_mat[:, :-VISUAL_NUM, -VISUAL_NUM:] = 0
rel_pos = self.relative_position_bucket(
rel_pos_mat,
num_buckets=self.rel_pos_bins,
max_distance=self.max_rel_pos,
)
rel_pos = F.one_hot(rel_pos, num_classes=self.rel_pos_onehot_size).type_as(hidden_states)
rel_pos = self.rel_pos_bias(rel_pos).permute(0, 3, 1, 2)
rel_pos = rel_pos.contiguous()
return rel_pos
def _cal_2d_pos_emb(self, hidden_states, bbox):
position_coord_x = bbox[:, :, 0]
position_coord_y = bbox[:, :, 3]
rel_pos_x_2d_mat = position_coord_x.unsqueeze(-2) - position_coord_x.unsqueeze(-1)
rel_pos_y_2d_mat = position_coord_y.unsqueeze(-2) - position_coord_y.unsqueeze(-1)
rel_pos_x = self.relative_position_bucket(
rel_pos_x_2d_mat,
num_buckets=self.rel_2d_pos_bins,
max_distance=self.max_rel_2d_pos,
)
rel_pos_y = self.relative_position_bucket(
rel_pos_y_2d_mat,
num_buckets=self.rel_2d_pos_bins,
max_distance=self.max_rel_2d_pos,
)
rel_pos_x = F.one_hot(rel_pos_x, num_classes=self.rel_2d_pos_onehot_size).type_as(hidden_states)
rel_pos_y = F.one_hot(rel_pos_y, num_classes=self.rel_2d_pos_onehot_size).type_as(hidden_states)
rel_pos_x = self.rel_pos_x_bias(rel_pos_x).permute(0, 3, 1, 2)
rel_pos_y = self.rel_pos_y_bias(rel_pos_y).permute(0, 3, 1, 2)
rel_pos_x = rel_pos_x.contiguous()
rel_pos_y = rel_pos_y.contiguous()
rel_2d_pos = rel_pos_x + rel_pos_y
return rel_2d_pos
def forward(
self,
hidden_states,
bbox=None,
attention_mask=None,
head_mask=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=False,
output_hidden_states=False,
return_dict=True,
position_ids=None,
Hp=None,
Wp=None,
valid_span=None,
):
all_hidden_states = () if output_hidden_states else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
next_decoder_cache = () if use_cache else None
rel_pos = self._cal_1d_pos_emb(hidden_states, position_ids, valid_span) if self.has_relative_attention_bias else None
rel_2d_pos = self._cal_2d_pos_emb(hidden_states, bbox) if self.has_spatial_attention_bias else None
if self.detection:
feat_out = {}
j = 0
for i, layer_module in enumerate(self.layer):
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
layer_head_mask = head_mask[i] if head_mask is not None else None
past_key_value = past_key_values[i] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning(
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
)
use_cache = False
def create_custom_forward(module):
def custom_forward(*inputs):
return module(*inputs)
# return module(*inputs, past_key_value, output_attentions, rel_pos, rel_2d_pos)
# The above line will cause error:
# RuntimeError: Trying to backward through the graph a second time
# (or directly access saved tensors after they have already been freed).
return custom_forward
layer_outputs = torch.utils.checkpoint.checkpoint(
create_custom_forward(layer_module),
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
rel_pos,
rel_2d_pos
)
else:
layer_outputs = layer_module(
hidden_states,
attention_mask,
layer_head_mask,
encoder_hidden_states,
encoder_attention_mask,
past_key_value,
output_attentions,
rel_pos=rel_pos,
rel_2d_pos=rel_2d_pos,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[-1],)
if output_attentions:
all_self_attentions = all_self_attentions + (layer_outputs[1],)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
if self.detection and i in self.out_indices:
xp = hidden_states[:, -Hp*Wp:, :].permute(0, 2, 1).reshape(len(hidden_states), -1, Hp, Wp)
feat_out[self.out_features[j]] = self.ops[j](xp.contiguous())
j += 1
if self.detection:
return feat_out
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)
if not return_dict:
return tuple(
v
for v in [
hidden_states,
next_decoder_cache,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_decoder_cache,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)
class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
"""
"""
_keys_to_ignore_on_load_missing = [r"position_ids"]
# Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta
def __init__(self, config, detection=False, out_features=None, image_only=False):
super().__init__(config)
self.config = config
assert not config.is_decoder and not config.add_cross_attention, \
"This version do not support decoder. Please refer to RoBERTa for implementation of is_decoder."
self.detection = detection
if not self.detection:
self.image_only = False
else:
assert config.visual_embed
self.image_only = image_only
if not self.image_only:
self.embeddings = LayoutLMv3Embeddings(config)
self.encoder = LayoutLMv3Encoder(config, detection=detection, out_features=out_features)
if config.visual_embed:
embed_dim = self.config.hidden_size
# use the default pre-training parameters for fine-tuning (e.g., input_size)
# when the input_size is larger in fine-tuning, we will interpolate the position embedding in forward
self.patch_embed = PatchEmbed(embed_dim=embed_dim)
patch_size = 16
size = int(self.config.input_size / patch_size)
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.pos_embed = nn.Parameter(torch.zeros(1, size * size + 1, embed_dim))
self.pos_drop = nn.Dropout(p=0.)
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
self._init_visual_bbox(img_size=(size, size))
from functools import partial
norm_layer = partial(nn.LayerNorm, eps=1e-6)
self.norm = norm_layer(embed_dim)
self.init_weights()
def get_input_embeddings(self):
return self.embeddings.word_embeddings
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def _prune_heads(self, heads_to_prune):
"""
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
"""
for layer, heads in heads_to_prune.items():
self.encoder.layer[layer].attention.prune_heads(heads)
def _init_visual_bbox(self, img_size=(14, 14), max_len=1000):
visual_bbox_x = torch.div(torch.arange(0, max_len * (img_size[1] + 1), max_len),
img_size[1], rounding_mode='trunc')
visual_bbox_y = torch.div(torch.arange(0, max_len * (img_size[0] + 1), max_len),
img_size[0], rounding_mode='trunc')
visual_bbox = torch.stack(
[
visual_bbox_x[:-1].repeat(img_size[0], 1),
visual_bbox_y[:-1].repeat(img_size[1], 1).transpose(0, 1),
visual_bbox_x[1:].repeat(img_size[0], 1),
visual_bbox_y[1:].repeat(img_size[1], 1).transpose(0, 1),
],
dim=-1,
).view(-1, 4)
cls_token_box = torch.tensor([[0 + 1, 0 + 1, max_len - 1, max_len - 1]])
self.visual_bbox = torch.cat([cls_token_box, visual_bbox], dim=0)
def _calc_visual_bbox(self, device, dtype, bsz): # , img_size=(14, 14), max_len=1000):
visual_bbox = self.visual_bbox.repeat(bsz, 1, 1)
visual_bbox = visual_bbox.to(device).type(dtype)
return visual_bbox
def forward_image(self, x):
if self.detection:
x = self.patch_embed(x, self.pos_embed[:, 1:, :] if self.pos_embed is not None else None)
else:
x = self.patch_embed(x)
batch_size, seq_len, _ = x.size()
cls_tokens = self.cls_token.expand(batch_size, -1, -1) # stole cls_tokens impl from Phil Wang, thanks
if self.pos_embed is not None and self.detection:
cls_tokens = cls_tokens + self.pos_embed[:, :1, :]
x = torch.cat((cls_tokens, x), dim=1)
if self.pos_embed is not None and not self.detection:
x = x + self.pos_embed
x = self.pos_drop(x)
x = self.norm(x)
return x
# Copied from transformers.models.bert.modeling_bert.BertModel.forward
def forward(
self,
input_ids=None,
bbox=None,
attention_mask=None,
token_type_ids=None,
valid_span=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
encoder_hidden_states=None,
encoder_attention_mask=None,
past_key_values=None,
use_cache=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
images=None,
):
r"""
encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
(those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
use_cache (:obj:`bool`, `optional`):
If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
decoding (see :obj:`past_key_values`).
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
use_cache = False
# if input_ids is not None and inputs_embeds is not None:
# raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
if input_ids is not None:
input_shape = input_ids.size()
batch_size, seq_length = input_shape
device = input_ids.device
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
batch_size, seq_length = input_shape
device = inputs_embeds.device
elif images is not None:
batch_size = len(images)
device = images.device
else:
raise ValueError("You have to specify either input_ids or inputs_embeds or images")
if not self.image_only:
# past_key_values_length
past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
if attention_mask is None:
attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
if token_type_ids is None:
token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
# extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
encoder_extended_attention_mask = None
# Prepare head mask if needed
# 1.0 in head_mask indicate we keep the head
# attention_probs has shape bsz x n_heads x N x N
# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
if not self.image_only:
if bbox is None:
bbox = torch.zeros(tuple(list(input_shape) + [4]), dtype=torch.long, device=device)
embedding_output = self.embeddings(
input_ids=input_ids,
bbox=bbox,
position_ids=position_ids,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds,
past_key_values_length=past_key_values_length,
)
final_bbox = final_position_ids = None
Hp = Wp = None
if images is not None:
patch_size = 16
Hp, Wp = int(images.shape[2] / patch_size), int(images.shape[3] / patch_size)
visual_emb = self.forward_image(images)
if self.detection:
visual_attention_mask = torch.ones((batch_size, visual_emb.shape[1]), dtype=torch.long, device=device)
if self.image_only:
attention_mask = visual_attention_mask
else:
attention_mask = torch.cat([attention_mask, visual_attention_mask], dim=1)
elif self.image_only:
attention_mask = torch.ones((batch_size, visual_emb.shape[1]), dtype=torch.long, device=device)
if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
if self.config.has_spatial_attention_bias:
visual_bbox = self._calc_visual_bbox(device, dtype=torch.long, bsz=batch_size)
if self.image_only:
final_bbox = visual_bbox
else:
final_bbox = torch.cat([bbox, visual_bbox], dim=1)
visual_position_ids = torch.arange(0, visual_emb.shape[1], dtype=torch.long, device=device).repeat(
batch_size, 1)
if self.image_only:
final_position_ids = visual_position_ids
else:
position_ids = torch.arange(0, input_shape[1], device=device).unsqueeze(0)
position_ids = position_ids.expand_as(input_ids)
final_position_ids = torch.cat([position_ids, visual_position_ids], dim=1)
if self.image_only:
embedding_output = visual_emb
else:
embedding_output = torch.cat([embedding_output, visual_emb], dim=1)
embedding_output = self.LayerNorm(embedding_output)
embedding_output = self.dropout(embedding_output)
elif self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
if self.config.has_spatial_attention_bias:
final_bbox = bbox
if self.config.has_relative_attention_bias:
position_ids = self.embeddings.position_ids[:, :input_shape[1]]
position_ids = position_ids.expand_as(input_ids)
final_position_ids = position_ids
extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, None, device)
encoder_outputs = self.encoder(
embedding_output,
bbox=final_bbox,
position_ids=final_position_ids,
attention_mask=extended_attention_mask,
head_mask=head_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_extended_attention_mask,
past_key_values=past_key_values,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
Hp=Hp,
Wp=Wp,
valid_span=valid_span,
)
if self.detection:
return encoder_outputs
sequence_output = encoder_outputs[0]
pooled_output = None
if not return_dict:
return (sequence_output, pooled_output) + encoder_outputs[1:]
return BaseModelOutputWithPoolingAndCrossAttentions(
last_hidden_state=sequence_output,
pooler_output=pooled_output,
past_key_values=encoder_outputs.past_key_values,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
cross_attentions=encoder_outputs.cross_attentions,
)
class LayoutLMv3ClassificationHead(nn.Module):
"""
Head for sentence-level classification tasks.
Reference: RobertaClassificationHead
"""
def __init__(self, config, pool_feature=False):
super().__init__()
self.pool_feature = pool_feature
if pool_feature:
self.dense = nn.Linear(config.hidden_size*3, config.hidden_size)
else:
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
classifier_dropout = (
config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
)
self.dropout = nn.Dropout(classifier_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, x):
# x = features[:, 0, :] # take <s> token (equiv. to [CLS])
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.layoutlmv3 = LayoutLMv3Model(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
if config.num_labels < 10:
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
else:
self.classifier = LayoutLMv3ClassificationHead(config, pool_feature=False)
self.init_weights()
def forward(
self,
input_ids=None,
bbox=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
valid_span=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
images=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
1]``.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.layoutlmv3(
input_ids,
bbox=bbox,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
images=images,
valid_span=valid_span,
)
sequence_output = outputs[0]
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
# Only keep active parts of the loss
if attention_mask is not None:
active_loss = attention_mask.view(-1) == 1
active_logits = logits.view(-1, self.num_labels)
active_labels = torch.where(
active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
)
loss = loss_fct(active_logits, active_labels)
else:
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return TokenClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel):
_keys_to_ignore_on_load_unexpected = [r"pooler"]
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.layoutlmv3 = LayoutLMv3Model(config)
# self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.qa_outputs = LayoutLMv3ClassificationHead(config, pool_feature=False)
self.init_weights()
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
valid_span=None,
head_mask=None,
inputs_embeds=None,
start_positions=None,
end_positions=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
bbox=None,
images=None,
):
r"""
start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
sequence are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.layoutlmv3(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
bbox=bbox,
images=images,
valid_span=valid_span,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, split add a dimension
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (start_logits, end_logits) + outputs[2:]
return ((total_loss,) + output) if total_loss is not None else output
return QuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel):
_keys_to_ignore_on_load_missing = [r"position_ids"]
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.config = config
self.layoutlmv3 = LayoutLMv3Model(config)
self.classifier = LayoutLMv3ClassificationHead(config, pool_feature=False)
self.init_weights()
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
valid_span=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
bbox=None,
images=None,
):
r"""
labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.layoutlmv3(
input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
position_ids=position_ids,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
bbox=bbox,
images=images,
valid_span=valid_span,
)
sequence_output = outputs[0][:, 0, :]
logits = self.classifier(sequence_output)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SequenceClassifierOutput(
loss=loss,
logits=logits,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for LayoutLMv3, refer to RoBERTa."""
from transformers.models.roberta import RobertaTokenizer
from transformers.utils import logging
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
}
class LayoutLMv3Tokenizer(RobertaTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
# pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
# coding=utf-8
# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Tokenization classes for LayoutLMv3, refer to RoBERTa."""
from transformers.models.roberta.tokenization_roberta_fast import RobertaTokenizerFast
from transformers.utils import logging
from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer
logger = logging.get_logger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
class LayoutLMv3TokenizerFast(RobertaTokenizerFast):
vocab_files_names = VOCAB_FILES_NAMES
# pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
# max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]
slow_tokenizer_class = LayoutLMv3Tokenizer
from .visualizer import Visualizer
from .rcnn_vl import *
from .backbone import *
from detectron2.config import get_cfg
from detectron2.config import CfgNode as CN
from detectron2.data import MetadataCatalog, DatasetCatalog
from detectron2.data.datasets import register_coco_instances
from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch, DefaultPredictor
def add_vit_config(cfg):
"""
Add config for VIT.
"""
_C = cfg
_C.MODEL.VIT = CN()
# CoaT model name.
_C.MODEL.VIT.NAME = ""
# Output features from CoaT backbone.
_C.MODEL.VIT.OUT_FEATURES = ["layer3", "layer5", "layer7", "layer11"]
_C.MODEL.VIT.IMG_SIZE = [224, 224]
_C.MODEL.VIT.POS_TYPE = "shared_rel"
_C.MODEL.VIT.DROP_PATH = 0.
_C.MODEL.VIT.MODEL_KWARGS = "{}"
_C.SOLVER.OPTIMIZER = "ADAMW"
_C.SOLVER.BACKBONE_MULTIPLIER = 1.0
_C.AUG = CN()
_C.AUG.DETR = False
_C.MODEL.IMAGE_ONLY = True
_C.PUBLAYNET_DATA_DIR_TRAIN = ""
_C.PUBLAYNET_DATA_DIR_TEST = ""
_C.FOOTNOTE_DATA_DIR_TRAIN = ""
_C.FOOTNOTE_DATA_DIR_VAL = ""
_C.SCIHUB_DATA_DIR_TRAIN = ""
_C.SCIHUB_DATA_DIR_TEST = ""
_C.JIAOCAI_DATA_DIR_TRAIN = ""
_C.JIAOCAI_DATA_DIR_TEST = ""
_C.ICDAR_DATA_DIR_TRAIN = ""
_C.ICDAR_DATA_DIR_TEST = ""
_C.M6DOC_DATA_DIR_TEST = ""
_C.DOCSTRUCTBENCH_DATA_DIR_TEST = ""
_C.DOCSTRUCTBENCHv2_DATA_DIR_TEST = ""
_C.CACHE_DIR = ""
_C.MODEL.CONFIG_PATH = ""
# effective update steps would be MAX_ITER/GRADIENT_ACCUMULATION_STEPS
# maybe need to set MAX_ITER *= GRADIENT_ACCUMULATION_STEPS
_C.SOLVER.GRADIENT_ACCUMULATION_STEPS = 1
def setup(args, device):
"""
Create configs and perform basic setups.
"""
cfg = get_cfg()
# add_coat_config(cfg)
add_vit_config(cfg)
cfg.merge_from_file(args.config_file)
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.2 # set threshold for this model
cfg.merge_from_list(args.opts)
# 使用统一的device配置
cfg.MODEL.DEVICE = device
cfg.freeze()
default_setup(cfg, args)
#@todo 可以删掉这块?
# register_coco_instances(
# "scihub_train",
# {},
# cfg.SCIHUB_DATA_DIR_TRAIN + ".json",
# cfg.SCIHUB_DATA_DIR_TRAIN
# )
return cfg
class DotDict(dict):
def __init__(self, *args, **kwargs):
super(DotDict, self).__init__(*args, **kwargs)
def __getattr__(self, key):
if key not in self.keys():
return None
value = self[key]
if isinstance(value, dict):
value = DotDict(value)
return value
def __setattr__(self, key, value):
self[key] = value
class Layoutlmv3_Predictor(object):
def __init__(self, weights, config_file, device):
layout_args = {
"config_file": config_file,
"resume": False,
"eval_only": False,
"num_gpus": 1,
"num_machines": 1,
"machine_rank": 0,
"dist_url": "tcp://127.0.0.1:57823",
"opts": ["MODEL.WEIGHTS", weights],
}
layout_args = DotDict(layout_args)
cfg = setup(layout_args, device)
self.mapping = ["title", "plain text", "abandon", "figure", "figure_caption", "table", "table_caption",
"table_footnote", "isolate_formula", "formula_caption"]
MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).thing_classes = self.mapping
self.predictor = DefaultPredictor(cfg)
def __call__(self, image, ignore_catids=[]):
# page_layout_result = {
# "layout_dets": []
# }
layout_dets = []
outputs = self.predictor(image)
boxes = outputs["instances"].to("cpu")._fields["pred_boxes"].tensor.tolist()
labels = outputs["instances"].to("cpu")._fields["pred_classes"].tolist()
scores = outputs["instances"].to("cpu")._fields["scores"].tolist()
for bbox_idx in range(len(boxes)):
if labels[bbox_idx] in ignore_catids:
continue
layout_dets.append({
"category_id": labels[bbox_idx],
"poly": [
boxes[bbox_idx][0], boxes[bbox_idx][1],
boxes[bbox_idx][2], boxes[bbox_idx][1],
boxes[bbox_idx][2], boxes[bbox_idx][3],
boxes[bbox_idx][0], boxes[bbox_idx][3],
],
"score": scores[bbox_idx]
})
return layout_dets
# Copyright (c) Facebook, Inc. and its affiliates.
import logging
import numpy as np
from typing import Dict, List, Optional, Tuple
import torch
from torch import nn
from detectron2.config import configurable
from detectron2.structures import ImageList, Instances
from detectron2.utils.events import get_event_storage
from detectron2.modeling.backbone import Backbone, build_backbone
from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
from detectron2.modeling.meta_arch import GeneralizedRCNN
from detectron2.modeling.postprocessing import detector_postprocess
from detectron2.modeling.roi_heads.fast_rcnn import fast_rcnn_inference_single_image
from contextlib import contextmanager
from itertools import count
@META_ARCH_REGISTRY.register()
class VLGeneralizedRCNN(GeneralizedRCNN):
"""
Generalized R-CNN. Any models that contains the following three components:
1. Per-image feature extraction (aka backbone)
2. Region proposal generation
3. Per-region feature extraction and prediction
"""
def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
"""
Args:
batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
Each item in the list contains the inputs for one image.
For now, each item in the list is a dict that contains:
* image: Tensor, image in (C, H, W) format.
* instances (optional): groundtruth :class:`Instances`
* proposals (optional): :class:`Instances`, precomputed proposals.
Other information that's included in the original dicts, such as:
* "height", "width" (int): the output resolution of the model, used in inference.
See :meth:`postprocess` for details.
Returns:
list[dict]:
Each dict is the output for one input image.
The dict contains one key "instances" whose value is a :class:`Instances`.
The :class:`Instances` object has the following keys:
"pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
"""
if not self.training:
return self.inference(batched_inputs)
images = self.preprocess_image(batched_inputs)
if "instances" in batched_inputs[0]:
gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
else:
gt_instances = None
# features = self.backbone(images.tensor)
input = self.get_batch(batched_inputs, images)
features = self.backbone(input)
if self.proposal_generator is not None:
proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
else:
assert "proposals" in batched_inputs[0]
proposals = [x["proposals"].to(self.device) for x in batched_inputs]
proposal_losses = {}
_, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
if self.vis_period > 0:
storage = get_event_storage()
if storage.iter % self.vis_period == 0:
self.visualize_training(batched_inputs, proposals)
losses = {}
losses.update(detector_losses)
losses.update(proposal_losses)
return losses
def inference(
self,
batched_inputs: List[Dict[str, torch.Tensor]],
detected_instances: Optional[List[Instances]] = None,
do_postprocess: bool = True,
):
"""
Run inference on the given inputs.
Args:
batched_inputs (list[dict]): same as in :meth:`forward`
detected_instances (None or list[Instances]): if not None, it
contains an `Instances` object per image. The `Instances`
object contains "pred_boxes" and "pred_classes" which are
known boxes in the image.
The inference will then skip the detection of bounding boxes,
and only predict other per-ROI outputs.
do_postprocess (bool): whether to apply post-processing on the outputs.
Returns:
When do_postprocess=True, same as in :meth:`forward`.
Otherwise, a list[Instances] containing raw network outputs.
"""
assert not self.training
images = self.preprocess_image(batched_inputs)
# features = self.backbone(images.tensor)
input = self.get_batch(batched_inputs, images)
features = self.backbone(input)
if detected_instances is None:
if self.proposal_generator is not None:
proposals, _ = self.proposal_generator(images, features, None)
else:
assert "proposals" in batched_inputs[0]
proposals = [x["proposals"].to(self.device) for x in batched_inputs]
results, _ = self.roi_heads(images, features, proposals, None)
else:
detected_instances = [x.to(self.device) for x in detected_instances]
results = self.roi_heads.forward_with_given_boxes(features, detected_instances)
if do_postprocess:
assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
else:
return results
def get_batch(self, examples, images):
if len(examples) >= 1 and "bbox" not in examples[0]: # image_only
return {"images": images.tensor}
return input
def _batch_inference(self, batched_inputs, detected_instances=None):
"""
Execute inference on a list of inputs,
using batch size = self.batch_size (e.g., 2), instead of the length of the list.
Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference`
"""
if detected_instances is None:
detected_instances = [None] * len(batched_inputs)
outputs = []
inputs, instances = [], []
for idx, input, instance in zip(count(), batched_inputs, detected_instances):
inputs.append(input)
instances.append(instance)
if len(inputs) == 2 or idx == len(batched_inputs) - 1:
outputs.extend(
self.inference(
inputs,
instances if instances[0] is not None else None,
do_postprocess=True, # False
)
)
inputs, instances = [], []
return outputs
# Copyright (c) Facebook, Inc. and its affiliates.
import colorsys
import logging
import math
import numpy as np
from enum import Enum, unique
import cv2
import matplotlib as mpl
import matplotlib.colors as mplc
import matplotlib.figure as mplfigure
import pycocotools.mask as mask_util
import torch
from matplotlib.backends.backend_agg import FigureCanvasAgg
from PIL import Image
from detectron2.data import MetadataCatalog
from detectron2.structures import BitMasks, Boxes, BoxMode, Keypoints, PolygonMasks, RotatedBoxes
from detectron2.utils.file_io import PathManager
from detectron2.utils.colormap import random_color
import pdb
logger = logging.getLogger(__name__)
__all__ = ["ColorMode", "VisImage", "Visualizer"]
_SMALL_OBJECT_AREA_THRESH = 1000
_LARGE_MASK_AREA_THRESH = 120000
_OFF_WHITE = (1.0, 1.0, 240.0 / 255)
_BLACK = (0, 0, 0)
_RED = (1.0, 0, 0)
_KEYPOINT_THRESHOLD = 0.05
#CLASS_NAMES = ["footnote", "footer", "header"]
@unique
class ColorMode(Enum):
"""
Enum of different color modes to use for instance visualizations.
"""
IMAGE = 0
"""
Picks a random color for every instance and overlay segmentations with low opacity.
"""
SEGMENTATION = 1
"""
Let instances of the same category have similar colors
(from metadata.thing_colors), and overlay them with
high opacity. This provides more attention on the quality of segmentation.
"""
IMAGE_BW = 2
"""
Same as IMAGE, but convert all areas without masks to gray-scale.
Only available for drawing per-instance mask predictions.
"""
class GenericMask:
"""
Attribute:
polygons (list[ndarray]): list[ndarray]: polygons for this mask.
Each ndarray has format [x, y, x, y, ...]
mask (ndarray): a binary mask
"""
def __init__(self, mask_or_polygons, height, width):
self._mask = self._polygons = self._has_holes = None
self.height = height
self.width = width
m = mask_or_polygons
if isinstance(m, dict):
# RLEs
assert "counts" in m and "size" in m
if isinstance(m["counts"], list): # uncompressed RLEs
h, w = m["size"]
assert h == height and w == width
m = mask_util.frPyObjects(m, h, w)
self._mask = mask_util.decode(m)[:, :]
return
if isinstance(m, list): # list[ndarray]
self._polygons = [np.asarray(x).reshape(-1) for x in m]
return
if isinstance(m, np.ndarray): # assumed to be a binary mask
assert m.shape[1] != 2, m.shape
assert m.shape == (
height,
width,
), f"mask shape: {m.shape}, target dims: {height}, {width}"
self._mask = m.astype("uint8")
return
raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m)))
@property
def mask(self):
if self._mask is None:
self._mask = self.polygons_to_mask(self._polygons)
return self._mask
@property
def polygons(self):
if self._polygons is None:
self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
return self._polygons
@property
def has_holes(self):
if self._has_holes is None:
if self._mask is not None:
self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
else:
self._has_holes = False # if original format is polygon, does not have holes
return self._has_holes
def mask_to_polygons(self, mask):
# cv2.RETR_CCOMP flag retrieves all the contours and arranges them to a 2-level
# hierarchy. External contours (boundary) of the object are placed in hierarchy-1.
# Internal contours (holes) are placed in hierarchy-2.
# cv2.CHAIN_APPROX_NONE flag gets vertices of polygons from contours.
mask = np.ascontiguousarray(mask) # some versions of cv2 does not support incontiguous arr
res = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
hierarchy = res[-1]
if hierarchy is None: # empty mask
return [], False
has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0
res = res[-2]
res = [x.flatten() for x in res]
# These coordinates from OpenCV are integers in range [0, W-1 or H-1].
# We add 0.5 to turn them into real-value coordinate space. A better solution
# would be to first +0.5 and then dilate the returned polygon by 0.5.
res = [x + 0.5 for x in res if len(x) >= 6]
return res, has_holes
def polygons_to_mask(self, polygons):
rle = mask_util.frPyObjects(polygons, self.height, self.width)
rle = mask_util.merge(rle)
return mask_util.decode(rle)[:, :]
def area(self):
return self.mask.sum()
def bbox(self):
p = mask_util.frPyObjects(self.polygons, self.height, self.width)
p = mask_util.merge(p)
bbox = mask_util.toBbox(p)
bbox[2] += bbox[0]
bbox[3] += bbox[1]
return bbox
class _PanopticPrediction:
"""
Unify different panoptic annotation/prediction formats
"""
def __init__(self, panoptic_seg, segments_info, metadata=None):
if segments_info is None:
assert metadata is not None
# If "segments_info" is None, we assume "panoptic_img" is a
# H*W int32 image storing the panoptic_id in the format of
# category_id * label_divisor + instance_id. We reserve -1 for
# VOID label.
label_divisor = metadata.label_divisor
segments_info = []
for panoptic_label in np.unique(panoptic_seg.numpy()):
if panoptic_label == -1:
# VOID region.
continue
pred_class = panoptic_label // label_divisor
isthing = pred_class in metadata.thing_dataset_id_to_contiguous_id.values()
segments_info.append(
{
"id": int(panoptic_label),
"category_id": int(pred_class),
"isthing": bool(isthing),
}
)
del metadata
self._seg = panoptic_seg
self._sinfo = {s["id"]: s for s in segments_info} # seg id -> seg info
segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True)
areas = areas.numpy()
sorted_idxs = np.argsort(-areas)
self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs]
self._seg_ids = self._seg_ids.tolist()
for sid, area in zip(self._seg_ids, self._seg_areas):
if sid in self._sinfo:
self._sinfo[sid]["area"] = float(area)
def non_empty_mask(self):
"""
Returns:
(H, W) array, a mask for all pixels that have a prediction
"""
empty_ids = []
for id in self._seg_ids:
if id not in self._sinfo:
empty_ids.append(id)
if len(empty_ids) == 0:
return np.zeros(self._seg.shape, dtype=np.uint8)
assert (
len(empty_ids) == 1
), ">1 ids corresponds to no labels. This is currently not supported"
return (self._seg != empty_ids[0]).numpy().astype(np.bool)
def semantic_masks(self):
for sid in self._seg_ids:
sinfo = self._sinfo.get(sid)
if sinfo is None or sinfo["isthing"]:
# Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions.
continue
yield (self._seg == sid).numpy().astype(np.bool), sinfo
def instance_masks(self):
for sid in self._seg_ids:
sinfo = self._sinfo.get(sid)
if sinfo is None or not sinfo["isthing"]:
continue
mask = (self._seg == sid).numpy().astype(np.bool)
if mask.sum() > 0:
yield mask, sinfo
def _create_text_labels(classes, scores, class_names, is_crowd=None):
"""
Args:
classes (list[int] or None):
scores (list[float] or None):
class_names (list[str] or None):
is_crowd (list[bool] or None):
Returns:
list[str] or None
"""
#class_names = CLASS_NAMES
labels = None
if classes is not None:
if class_names is not None and len(class_names) > 0:
labels = [class_names[i] for i in classes]
else:
labels = [str(i) for i in classes]
if scores is not None:
if labels is None:
labels = ["{:.0f}%".format(s * 100) for s in scores]
else:
labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)]
if labels is not None and is_crowd is not None:
labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)]
return labels
class VisImage:
def __init__(self, img, scale=1.0):
"""
Args:
img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255].
scale (float): scale the input image
"""
self.img = img
self.scale = scale
self.width, self.height = img.shape[1], img.shape[0]
self._setup_figure(img)
def _setup_figure(self, img):
"""
Args:
Same as in :meth:`__init__()`.
Returns:
fig (matplotlib.pyplot.figure): top level container for all the image plot elements.
ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system.
"""
fig = mplfigure.Figure(frameon=False)
self.dpi = fig.get_dpi()
# add a small 1e-2 to avoid precision lost due to matplotlib's truncation
# (https://github.com/matplotlib/matplotlib/issues/15363)
fig.set_size_inches(
(self.width * self.scale + 1e-2) / self.dpi,
(self.height * self.scale + 1e-2) / self.dpi,
)
self.canvas = FigureCanvasAgg(fig)
# self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig)
ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
ax.axis("off")
self.fig = fig
self.ax = ax
self.reset_image(img)
def reset_image(self, img):
"""
Args:
img: same as in __init__
"""
img = img.astype("uint8")
self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest")
def save(self, filepath):
"""
Args:
filepath (str): a string that contains the absolute path, including the file name, where
the visualized image will be saved.
"""
self.fig.savefig(filepath)
def get_image(self):
"""
Returns:
ndarray:
the visualized image of shape (H, W, 3) (RGB) in uint8 type.
The shape is scaled w.r.t the input image using the given `scale` argument.
"""
canvas = self.canvas
s, (width, height) = canvas.print_to_buffer()
# buf = io.BytesIO() # works for cairo backend
# canvas.print_rgba(buf)
# width, height = self.width, self.height
# s = buf.getvalue()
buffer = np.frombuffer(s, dtype="uint8")
img_rgba = buffer.reshape(height, width, 4)
rgb, alpha = np.split(img_rgba, [3], axis=2)
return rgb.astype("uint8")
class Visualizer:
"""
Visualizer that draws data about detection/segmentation on images.
It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}`
that draw primitive objects to images, as well as high-level wrappers like
`draw_{instance_predictions,sem_seg,panoptic_seg_predictions,dataset_dict}`
that draw composite data in some pre-defined style.
Note that the exact visualization style for the high-level wrappers are subject to change.
Style such as color, opacity, label contents, visibility of labels, or even the visibility
of objects themselves (e.g. when the object is too small) may change according
to different heuristics, as long as the results still look visually reasonable.
To obtain a consistent style, you can implement custom drawing functions with the
abovementioned primitive methods instead. If you need more customized visualization
styles, you can process the data yourself following their format documented in
tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not
intend to satisfy everyone's preference on drawing styles.
This visualizer focuses on high rendering quality rather than performance. It is not
designed to be used for real-time applications.
"""
# TODO implement a fast, rasterized version using OpenCV
def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE):
"""
Args:
img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
the height and width of the image respectively. C is the number of
color channels. The image is required to be in RGB format since that
is a requirement of the Matplotlib library. The image is also expected
to be in the range [0, 255].
metadata (Metadata): dataset metadata (e.g. class names and colors)
instance_mode (ColorMode): defines one of the pre-defined style for drawing
instances on an image.
"""
self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
if metadata is None:
metadata = MetadataCatalog.get("__nonexist__")
self.metadata = metadata
self.output = VisImage(self.img, scale=scale)
self.cpu_device = torch.device("cpu")
# too small texts are useless, therefore clamp to 9
self._default_font_size = max(
np.sqrt(self.output.height * self.output.width) // 90, 10 // scale
)
self._instance_mode = instance_mode
self.keypoint_threshold = _KEYPOINT_THRESHOLD
def draw_instance_predictions(self, predictions):
"""
Draw instance-level prediction results on an image.
Args:
predictions (Instances): the output of an instance detection/segmentation
model. Following fields will be used to draw:
"pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
Returns:
output (VisImage): image object with visualizations.
"""
boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
scores = predictions.scores if predictions.has("scores") else None
classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None
labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None
if predictions.has("pred_masks"):
masks = np.asarray(predictions.pred_masks)
masks = [GenericMask(x, self.output.height, self.output.width) for x in masks]
else:
masks = None
if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
colors = [
self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes
]
alpha = 0.8
else:
colors = None
alpha = 0.5
if self._instance_mode == ColorMode.IMAGE_BW:
self.output.reset_image(
self._create_grayscale_image(
(predictions.pred_masks.any(dim=0) > 0).numpy()
if predictions.has("pred_masks")
else None
)
)
alpha = 0.3
self.overlay_instances(
masks=masks,
boxes=boxes,
labels=labels,
keypoints=keypoints,
assigned_colors=colors,
alpha=alpha,
)
return self.output
def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8):
"""
Draw semantic segmentation predictions/labels.
Args:
sem_seg (Tensor or ndarray): the segmentation of shape (H, W).
Each value is the integer label of the pixel.
area_threshold (int): segments with less than `area_threshold` are not drawn.
alpha (float): the larger it is, the more opaque the segmentations are.
Returns:
output (VisImage): image object with visualizations.
"""
if isinstance(sem_seg, torch.Tensor):
sem_seg = sem_seg.numpy()
labels, areas = np.unique(sem_seg, return_counts=True)
sorted_idxs = np.argsort(-areas).tolist()
labels = labels[sorted_idxs]
for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels):
try:
mask_color = [x / 255 for x in self.metadata.stuff_colors[label]]
except (AttributeError, IndexError):
mask_color = None
binary_mask = (sem_seg == label).astype(np.uint8)
text = self.metadata.stuff_classes[label]
self.draw_binary_mask(
binary_mask,
color=mask_color,
edge_color=_OFF_WHITE,
text=text,
alpha=alpha,
area_threshold=area_threshold,
)
return self.output
def draw_panoptic_seg(self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7):
"""
Draw panoptic prediction annotations or results.
Args:
panoptic_seg (Tensor): of shape (height, width) where the values are ids for each
segment.
segments_info (list[dict] or None): Describe each segment in `panoptic_seg`.
If it is a ``list[dict]``, each dict contains keys "id", "category_id".
If None, category id of each pixel is computed by
``pixel // metadata.label_divisor``.
area_threshold (int): stuff segments with less than `area_threshold` are not drawn.
Returns:
output (VisImage): image object with visualizations.
"""
pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata)
if self._instance_mode == ColorMode.IMAGE_BW:
self.output.reset_image(self._create_grayscale_image(pred.non_empty_mask()))
# draw mask for all semantic segments first i.e. "stuff"
for mask, sinfo in pred.semantic_masks():
category_idx = sinfo["category_id"]
try:
mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]]
except AttributeError:
mask_color = None
text = self.metadata.stuff_classes[category_idx]
self.draw_binary_mask(
mask,
color=mask_color,
edge_color=_OFF_WHITE,
text=text,
alpha=alpha,
area_threshold=area_threshold,
)
# draw mask for all instances second
all_instances = list(pred.instance_masks())
if len(all_instances) == 0:
return self.output
masks, sinfo = list(zip(*all_instances))
category_ids = [x["category_id"] for x in sinfo]
try:
scores = [x["score"] for x in sinfo]
except KeyError:
scores = None
labels = _create_text_labels(
category_ids, scores, self.metadata.thing_classes, [x.get("iscrowd", 0) for x in sinfo]
)
try:
colors = [
self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in category_ids
]
except AttributeError:
colors = None
self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha)
return self.output
draw_panoptic_seg_predictions = draw_panoptic_seg # backward compatibility
def draw_dataset_dict(self, dic):
"""
Draw annotations/segmentaions in Detectron2 Dataset format.
Args:
dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.
Returns:
output (VisImage): image object with visualizations.
"""
annos = dic.get("annotations", None)
if annos:
if "segmentation" in annos[0]:
masks = [x["segmentation"] for x in annos]
else:
masks = None
if "keypoints" in annos[0]:
keypts = [x["keypoints"] for x in annos]
keypts = np.array(keypts).reshape(len(annos), -1, 3)
else:
keypts = None
boxes = [
BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS)
if len(x["bbox"]) == 4
else x["bbox"]
for x in annos
]
colors = None
category_ids = [x["category_id"] for x in annos]
if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
colors = [
self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
for c in category_ids
]
names = self.metadata.get("thing_classes", None)
labels = _create_text_labels(
category_ids,
scores=None,
class_names=names,
is_crowd=[x.get("iscrowd", 0) for x in annos],
)
self.overlay_instances(
labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors
)
sem_seg = dic.get("sem_seg", None)
if sem_seg is None and "sem_seg_file_name" in dic:
with PathManager.open(dic["sem_seg_file_name"], "rb") as f:
sem_seg = Image.open(f)
sem_seg = np.asarray(sem_seg, dtype="uint8")
if sem_seg is not None:
self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5)
pan_seg = dic.get("pan_seg", None)
if pan_seg is None and "pan_seg_file_name" in dic:
with PathManager.open(dic["pan_seg_file_name"], "rb") as f:
pan_seg = Image.open(f)
pan_seg = np.asarray(pan_seg)
from panopticapi.utils import rgb2id
pan_seg = rgb2id(pan_seg)
if pan_seg is not None:
segments_info = dic["segments_info"]
pan_seg = torch.tensor(pan_seg)
self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.5)
return self.output
def overlay_instances(
self,
*,
boxes=None,
labels=None,
masks=None,
keypoints=None,
assigned_colors=None,
alpha=0.5,
):
"""
Args:
boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`,
or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image,
or a :class:`RotatedBoxes`,
or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format
for the N objects in a single image,
labels (list[str]): the text to be displayed for each instance.
masks (masks-like object): Supported types are:
* :class:`detectron2.structures.PolygonMasks`,
:class:`detectron2.structures.BitMasks`.
* list[list[ndarray]]: contains the segmentation masks for all objects in one image.
The first level of the list corresponds to individual instances. The second
level to all the polygon that compose the instance, and the third level
to the polygon coordinates. The third level should have the format of
[x0, y0, x1, y1, ..., xn, yn] (n >= 3).
* list[ndarray]: each ndarray is a binary mask of shape (H, W).
* list[dict]: each dict is a COCO-style RLE.
keypoints (Keypoint or array like): an array-like object of shape (N, K, 3),
where the N is the number of instances and K is the number of keypoints.
The last dimension corresponds to (x, y, visibility or score).
assigned_colors (list[matplotlib.colors]): a list of colors, where each color
corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
for full list of formats that the colors are accepted in.
Returns:
output (VisImage): image object with visualizations.
"""
num_instances = 0
if boxes is not None:
boxes = self._convert_boxes(boxes)
num_instances = len(boxes)
if masks is not None:
masks = self._convert_masks(masks)
if num_instances:
assert len(masks) == num_instances
else:
num_instances = len(masks)
if keypoints is not None:
if num_instances:
assert len(keypoints) == num_instances
else:
num_instances = len(keypoints)
keypoints = self._convert_keypoints(keypoints)
if labels is not None:
assert len(labels) == num_instances
if assigned_colors is None:
assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
if num_instances == 0:
return self.output
if boxes is not None and boxes.shape[1] == 5:
return self.overlay_rotated_instances(
boxes=boxes, labels=labels, assigned_colors=assigned_colors
)
# Display in largest to smallest order to reduce occlusion.
areas = None
if boxes is not None:
areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
elif masks is not None:
areas = np.asarray([x.area() for x in masks])
if areas is not None:
sorted_idxs = np.argsort(-areas).tolist()
# Re-order overlapped instances in descending order.
boxes = boxes[sorted_idxs] if boxes is not None else None
labels = [labels[k] for k in sorted_idxs] if labels is not None else None
masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None
assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
keypoints = keypoints[sorted_idxs] if keypoints is not None else None
for i in range(num_instances):
color = assigned_colors[i]
if boxes is not None:
self.draw_box(boxes[i], edge_color=color)
if masks is not None:
for segment in masks[i].polygons:
self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha)
if labels is not None:
# first get a box
if boxes is not None:
x0, y0, x1, y1 = boxes[i]
text_pos = (x0, y0) # if drawing boxes, put text on the box corner.
horiz_align = "left"
elif masks is not None:
# skip small mask without polygon
if len(masks[i].polygons) == 0:
continue
x0, y0, x1, y1 = masks[i].bbox()
# draw text in the center (defined by median) when box is not drawn
# median is less sensitive to outliers.
text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1]
horiz_align = "center"
else:
continue # drawing the box confidence for keypoints isn't very useful.
# for small objects, draw text at the side to avoid occlusion
instance_area = (y1 - y0) * (x1 - x0)
if (
instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale
or y1 - y0 < 40 * self.output.scale
):
if y1 >= self.output.height - 5:
text_pos = (x1, y0)
else:
text_pos = (x0, y1)
height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width)
lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
font_size = (
np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2)
* 0.5
* self._default_font_size
)
self.draw_text(
labels[i],
text_pos,
color=lighter_color,
horizontal_alignment=horiz_align,
font_size=font_size,
)
# draw keypoints
if keypoints is not None:
for keypoints_per_instance in keypoints:
self.draw_and_connect_keypoints(keypoints_per_instance)
return self.output
def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None):
"""
Args:
boxes (ndarray): an Nx5 numpy array of
(x_center, y_center, width, height, angle_degrees) format
for the N objects in a single image.
labels (list[str]): the text to be displayed for each instance.
assigned_colors (list[matplotlib.colors]): a list of colors, where each color
corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
for full list of formats that the colors are accepted in.
Returns:
output (VisImage): image object with visualizations.
"""
num_instances = len(boxes)
if assigned_colors is None:
assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
if num_instances == 0:
return self.output
# Display in largest to smallest order to reduce occlusion.
if boxes is not None:
areas = boxes[:, 2] * boxes[:, 3]
sorted_idxs = np.argsort(-areas).tolist()
# Re-order overlapped instances in descending order.
boxes = boxes[sorted_idxs]
labels = [labels[k] for k in sorted_idxs] if labels is not None else None
colors = [assigned_colors[idx] for idx in sorted_idxs]
for i in range(num_instances):
self.draw_rotated_box_with_label(
boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None
)
return self.output
def draw_and_connect_keypoints(self, keypoints):
"""
Draws keypoints of an instance and follows the rules for keypoint connections
to draw lines between appropriate keypoints. This follows color heuristics for
line color.
Args:
keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints
and the last dimension corresponds to (x, y, probability).
Returns:
output (VisImage): image object with visualizations.
"""
visible = {}
keypoint_names = self.metadata.get("keypoint_names")
for idx, keypoint in enumerate(keypoints):
# draw keypoint
x, y, prob = keypoint
if prob > self.keypoint_threshold:
self.draw_circle((x, y), color=_RED)
if keypoint_names:
keypoint_name = keypoint_names[idx]
visible[keypoint_name] = (x, y)
if self.metadata.get("keypoint_connection_rules"):
for kp0, kp1, color in self.metadata.keypoint_connection_rules:
if kp0 in visible and kp1 in visible:
x0, y0 = visible[kp0]
x1, y1 = visible[kp1]
color = tuple(x / 255.0 for x in color)
self.draw_line([x0, x1], [y0, y1], color=color)
# draw lines from nose to mid-shoulder and mid-shoulder to mid-hip
# Note that this strategy is specific to person keypoints.
# For other keypoints, it should just do nothing
try:
ls_x, ls_y = visible["left_shoulder"]
rs_x, rs_y = visible["right_shoulder"]
mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2
except KeyError:
pass
else:
# draw line from nose to mid-shoulder
nose_x, nose_y = visible.get("nose", (None, None))
if nose_x is not None:
self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED)
try:
# draw line from mid-shoulder to mid-hip
lh_x, lh_y = visible["left_hip"]
rh_x, rh_y = visible["right_hip"]
except KeyError:
pass
else:
mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2
self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED)
return self.output
"""
Primitive drawing functions:
"""
def draw_text(
self,
text,
position,
*,
font_size=None,
color="g",
horizontal_alignment="center",
rotation=0,
):
"""
Args:
text (str): class label
position (tuple): a tuple of the x and y coordinates to place text on image.
font_size (int, optional): font of the text. If not provided, a font size
proportional to the image width is calculated and used.
color: color of the text. Refer to `matplotlib.colors` for full list
of formats that are accepted.
horizontal_alignment (str): see `matplotlib.text.Text`
rotation: rotation angle in degrees CCW
Returns:
output (VisImage): image object with text drawn.
"""
if not font_size:
font_size = self._default_font_size
# since the text background is dark, we don't want the text to be dark
color = np.maximum(list(mplc.to_rgb(color)), 0.2)
color[np.argmax(color)] = max(0.8, np.max(color))
x, y = position
self.output.ax.text(
x,
y,
text,
size=font_size * self.output.scale,
family="sans-serif",
bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
verticalalignment="top",
horizontalalignment=horizontal_alignment,
color=color,
zorder=10,
rotation=rotation,
)
return self.output
def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
"""
Args:
box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
are the coordinates of the image's top left corner. x1 and y1 are the
coordinates of the image's bottom right corner.
alpha (float): blending efficient. Smaller values lead to more transparent masks.
edge_color: color of the outline of the box. Refer to `matplotlib.colors`
for full list of formats that are accepted.
line_style (string): the string to use to create the outline of the boxes.
Returns:
output (VisImage): image object with box drawn.
"""
x0, y0, x1, y1 = box_coord
width = x1 - x0
height = y1 - y0
linewidth = max(self._default_font_size / 4, 1)
self.output.ax.add_patch(
mpl.patches.Rectangle(
(x0, y0),
width,
height,
fill=False,
edgecolor=edge_color,
linewidth=linewidth * self.output.scale,
alpha=alpha,
linestyle=line_style,
)
)
return self.output
def draw_rotated_box_with_label(
self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None
):
"""
Draw a rotated box with label on its top-left corner.
Args:
rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle),
where cnt_x and cnt_y are the center coordinates of the box.
w and h are the width and height of the box. angle represents how
many degrees the box is rotated CCW with regard to the 0-degree box.
alpha (float): blending efficient. Smaller values lead to more transparent masks.
edge_color: color of the outline of the box. Refer to `matplotlib.colors`
for full list of formats that are accepted.
line_style (string): the string to use to create the outline of the boxes.
label (string): label for rotated box. It will not be rendered when set to None.
Returns:
output (VisImage): image object with box drawn.
"""
cnt_x, cnt_y, w, h, angle = rotated_box
area = w * h
# use thinner lines when the box is small
linewidth = self._default_font_size / (
6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3
)
theta = angle * math.pi / 180.0
c = math.cos(theta)
s = math.sin(theta)
rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)]
# x: left->right ; y: top->down
rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect]
for k in range(4):
j = (k + 1) % 4
self.draw_line(
[rotated_rect[k][0], rotated_rect[j][0]],
[rotated_rect[k][1], rotated_rect[j][1]],
color=edge_color,
linestyle="--" if k == 1 else line_style,
linewidth=linewidth,
)
if label is not None:
text_pos = rotated_rect[1] # topleft corner
height_ratio = h / np.sqrt(self.output.height * self.output.width)
label_color = self._change_color_brightness(edge_color, brightness_factor=0.7)
font_size = (
np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size
)
self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle)
return self.output
def draw_circle(self, circle_coord, color, radius=3):
"""
Args:
circle_coord (list(int) or tuple(int)): contains the x and y coordinates
of the center of the circle.
color: color of the polygon. Refer to `matplotlib.colors` for a full list of
formats that are accepted.
radius (int): radius of the circle.
Returns:
output (VisImage): image object with box drawn.
"""
x, y = circle_coord
self.output.ax.add_patch(
mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color)
)
return self.output
def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None):
"""
Args:
x_data (list[int]): a list containing x values of all the points being drawn.
Length of list should match the length of y_data.
y_data (list[int]): a list containing y values of all the points being drawn.
Length of list should match the length of x_data.
color: color of the line. Refer to `matplotlib.colors` for a full list of
formats that are accepted.
linestyle: style of the line. Refer to `matplotlib.lines.Line2D`
for a full list of formats that are accepted.
linewidth (float or None): width of the line. When it's None,
a default value will be computed and used.
Returns:
output (VisImage): image object with line drawn.
"""
if linewidth is None:
linewidth = self._default_font_size / 3
linewidth = max(linewidth, 1)
self.output.ax.add_line(
mpl.lines.Line2D(
x_data,
y_data,
linewidth=linewidth * self.output.scale,
color=color,
linestyle=linestyle,
)
)
return self.output
def draw_binary_mask(
self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.5, area_threshold=0
):
"""
Args:
binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and
W is the image width. Each value in the array is either a 0 or 1 value of uint8
type.
color: color of the mask. Refer to `matplotlib.colors` for a full list of
formats that are accepted. If None, will pick a random color.
edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
full list of formats that are accepted.
text (str): if None, will be drawn in the object's center of mass.
alpha (float): blending efficient. Smaller values lead to more transparent masks.
area_threshold (float): a connected component small than this will not be shown.
Returns:
output (VisImage): image object with mask drawn.
"""
if color is None:
color = random_color(rgb=True, maximum=1)
color = mplc.to_rgb(color)
has_valid_segment = False
binary_mask = binary_mask.astype("uint8") # opencv needs uint8
mask = GenericMask(binary_mask, self.output.height, self.output.width)
shape2d = (binary_mask.shape[0], binary_mask.shape[1])
if not mask.has_holes:
# draw polygons for regular masks
for segment in mask.polygons:
area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1]))
if area < (area_threshold or 0):
continue
has_valid_segment = True
segment = segment.reshape(-1, 2)
self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha)
else:
# TODO: Use Path/PathPatch to draw vector graphics:
# https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon
rgba = np.zeros(shape2d + (4,), dtype="float32")
rgba[:, :, :3] = color
rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha
has_valid_segment = True
self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0))
if text is not None and has_valid_segment:
# TODO sometimes drawn on wrong objects. the heuristics here can improve.
lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
_num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8)
largest_component_id = np.argmax(stats[1:, -1]) + 1
# draw text on the largest component, as well as other very large components.
for cid in range(1, _num_cc):
if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH:
# median is more stable than centroid
# center = centroids[largest_component_id]
center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1]
self.draw_text(text, center, color=lighter_color)
return self.output
def draw_polygon(self, segment, color, edge_color=None, alpha=0.5):
"""
Args:
segment: numpy array of shape Nx2, containing all the points in the polygon.
color: color of the polygon. Refer to `matplotlib.colors` for a full list of
formats that are accepted.
edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
full list of formats that are accepted. If not provided, a darker shade
of the polygon color will be used instead.
alpha (float): blending efficient. Smaller values lead to more transparent masks.
Returns:
output (VisImage): image object with polygon drawn.
"""
if edge_color is None:
# make edge color darker than the polygon color
if alpha > 0.8:
edge_color = self._change_color_brightness(color, brightness_factor=-0.7)
else:
edge_color = color
edge_color = mplc.to_rgb(edge_color) + (1,)
polygon = mpl.patches.Polygon(
segment,
fill=True,
facecolor=mplc.to_rgb(color) + (alpha,),
edgecolor=edge_color,
linewidth=max(self._default_font_size // 15 * self.output.scale, 1),
)
self.output.ax.add_patch(polygon)
return self.output
"""
Internal methods:
"""
def _jitter(self, color):
"""
Randomly modifies given color to produce a slightly different color than the color given.
Args:
color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color
picked. The values in the list are in the [0.0, 1.0] range.
Returns:
jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the
color after being jittered. The values in the list are in the [0.0, 1.0] range.
"""
color = mplc.to_rgb(color)
vec = np.random.rand(3)
# better to do it in another color space
vec = vec / np.linalg.norm(vec) * 0.5
res = np.clip(vec + color, 0, 1)
return tuple(res)
def _create_grayscale_image(self, mask=None):
"""
Create a grayscale version of the original image.
The colors in masked area, if given, will be kept.
"""
img_bw = self.img.astype("f4").mean(axis=2)
img_bw = np.stack([img_bw] * 3, axis=2)
if mask is not None:
img_bw[mask] = self.img[mask]
return img_bw
def _change_color_brightness(self, color, brightness_factor):
"""
Depending on the brightness_factor, gives a lighter or darker color i.e. a color with
less or more saturation than the original color.
Args:
color: color of the polygon. Refer to `matplotlib.colors` for a full list of
formats that are accepted.
brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
0 will correspond to no change, a factor in [-1.0, 0) range will result in
a darker color and a factor in (0, 1.0] range will result in a lighter color.
Returns:
modified_color (tuple[double]): a tuple containing the RGB values of the
modified color. Each value in the tuple is in the [0.0, 1.0] range.
"""
assert brightness_factor >= -1.0 and brightness_factor <= 1.0
color = mplc.to_rgb(color)
polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color))
modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1])
modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness
modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness
modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2])
return modified_color
def _convert_boxes(self, boxes):
"""
Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension.
"""
if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes):
return boxes.tensor.detach().numpy()
else:
return np.asarray(boxes)
def _convert_masks(self, masks_or_polygons):
"""
Convert different format of masks or polygons to a tuple of masks and polygons.
Returns:
list[GenericMask]:
"""
m = masks_or_polygons
if isinstance(m, PolygonMasks):
m = m.polygons
if isinstance(m, BitMasks):
m = m.tensor.numpy()
if isinstance(m, torch.Tensor):
m = m.numpy()
ret = []
for x in m:
if isinstance(x, GenericMask):
ret.append(x)
else:
ret.append(GenericMask(x, self.output.height, self.output.width))
return ret
def _convert_keypoints(self, keypoints):
if isinstance(keypoints, Keypoints):
keypoints = keypoints.tensor
keypoints = np.asarray(keypoints)
return keypoints
def get_output(self):
"""
Returns:
output (VisImage): the image output containing the visualizations added
to the image.
"""
return self.output
from tqdm import tqdm
from ultralytics import YOLO
class YOLOv8MFDModel(object):
def __init__(self, weight, device="cpu"):
self.mfd_model = YOLO(weight)
self.device = device
def predict(self, image):
mfd_res = self.mfd_model.predict(
image, imgsz=1888, conf=0.25, iou=0.45, verbose=False, device=self.device
)[0]
return mfd_res
def batch_predict(self, images: list, batch_size: int) -> list:
images_mfd_res = []
# for index in range(0, len(images), batch_size):
for index in tqdm(range(0, len(images), batch_size), desc="MFD Predict"):
mfd_res = [
image_res.cpu()
for image_res in self.mfd_model.predict(
images[index : index + batch_size],
imgsz=1888,
conf=0.25,
iou=0.45,
verbose=False,
device=self.device,
)
]
for image_res in mfd_res:
images_mfd_res.append(image_res)
return images_mfd_res
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
class MathDataset(Dataset):
def __init__(self, image_paths, transform=None):
self.image_paths = image_paths
self.transform = transform
def __len__(self):
return len(self.image_paths)
def __getitem__(self, idx):
raw_image = self.image_paths[idx]
if self.transform:
image = self.transform(raw_image)
return image
class UnimernetModel(object):
def __init__(self, weight_dir, cfg_path, _device_="cpu"):
from .unimernet_hf import UnimernetModel
if _device_.startswith("mps"):
self.model = UnimernetModel.from_pretrained(weight_dir, attn_implementation="eager")
else:
self.model = UnimernetModel.from_pretrained(weight_dir)
self.device = _device_
self.model.to(_device_)
if not _device_.startswith("cpu"):
self.model = self.model.to(dtype=torch.float16)
self.model.eval()
def predict(self, mfd_res, image):
formula_list = []
mf_image_list = []
for xyxy, conf, cla in zip(
mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()
):
xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
new_item = {
"category_id": 13 + int(cla.item()),
"poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
"score": round(float(conf.item()), 2),
"latex": "",
}
formula_list.append(new_item)
bbox_img = image[ymin:ymax, xmin:xmax]
mf_image_list.append(bbox_img)
dataset = MathDataset(mf_image_list, transform=self.model.transform)
dataloader = DataLoader(dataset, batch_size=32, num_workers=0)
mfr_res = []
for mf_img in dataloader:
mf_img = mf_img.to(dtype=self.model.dtype)
mf_img = mf_img.to(self.device)
with torch.no_grad():
output = self.model.generate({"image": mf_img})
mfr_res.extend(output["fixed_str"])
for res, latex in zip(formula_list, mfr_res):
res["latex"] = latex
return formula_list
def batch_predict(self, images_mfd_res: list, images: list, batch_size: int = 64) -> list:
images_formula_list = []
mf_image_list = []
backfill_list = []
image_info = [] # Store (area, original_index, image) tuples
# Collect images with their original indices
for image_index in range(len(images_mfd_res)):
mfd_res = images_mfd_res[image_index]
np_array_image = images[image_index]
formula_list = []
for idx, (xyxy, conf, cla) in enumerate(zip(
mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
)):
xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
new_item = {
"category_id": 13 + int(cla.item()),
"poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
"score": round(float(conf.item()), 2),
"latex": "",
}
formula_list.append(new_item)
bbox_img = np_array_image[ymin:ymax, xmin:xmax]
area = (xmax - xmin) * (ymax - ymin)
curr_idx = len(mf_image_list)
image_info.append((area, curr_idx, bbox_img))
mf_image_list.append(bbox_img)
images_formula_list.append(formula_list)
backfill_list += formula_list
# Stable sort by area
image_info.sort(key=lambda x: x[0]) # sort by area
sorted_indices = [x[1] for x in image_info]
sorted_images = [x[2] for x in image_info]
# Create mapping for results
index_mapping = {new_idx: old_idx for new_idx, old_idx in enumerate(sorted_indices)}
# Create dataset with sorted images
dataset = MathDataset(sorted_images, transform=self.model.transform)
dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0)
# Process batches and store results
mfr_res = []
# for mf_img in dataloader:
with tqdm(total=len(sorted_images), desc="MFR Predict") as pbar:
for index, mf_img in enumerate(dataloader):
mf_img = mf_img.to(dtype=self.model.dtype)
mf_img = mf_img.to(self.device)
with torch.no_grad():
output = self.model.generate({"image": mf_img})
mfr_res.extend(output["fixed_str"])
# 更新进度条,每次增加batch_size,但要注意最后一个batch可能不足batch_size
current_batch_size = min(batch_size, len(sorted_images) - index * batch_size)
pbar.update(current_batch_size)
# Restore original order
unsorted_results = [""] * len(mfr_res)
for new_idx, latex in enumerate(mfr_res):
original_idx = index_mapping[new_idx]
unsorted_results[original_idx] = latex
# Fill results back
for res, latex in zip(backfill_list, unsorted_results):
res["latex"] = latex
return images_formula_list
from .unimer_swin import UnimerSwinConfig, UnimerSwinModel, UnimerSwinImageProcessor
from .unimer_mbart import UnimerMBartConfig, UnimerMBartModel, UnimerMBartForCausalLM
from .modeling_unimernet import UnimernetModel
__all__ = [
"UnimerSwinConfig",
"UnimerSwinModel",
"UnimerSwinImageProcessor",
"UnimerMBartConfig",
"UnimerMBartModel",
"UnimerMBartForCausalLM",
"UnimernetModel",
]
import os
import re
import warnings
from typing import Optional
import torch
from ftfy import fix_text
from loguru import logger
from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer, PretrainedConfig, PreTrainedModel
from transformers import VisionEncoderDecoderConfig, VisionEncoderDecoderModel
from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import logger as base_model_logger
from .unimer_swin import UnimerSwinConfig, UnimerSwinModel, UnimerSwinImageProcessor
from .unimer_mbart import UnimerMBartConfig, UnimerMBartForCausalLM
AutoConfig.register(UnimerSwinConfig.model_type, UnimerSwinConfig)
AutoConfig.register(UnimerMBartConfig.model_type, UnimerMBartConfig)
AutoModel.register(UnimerSwinConfig, UnimerSwinModel)
AutoModelForCausalLM.register(UnimerMBartConfig, UnimerMBartForCausalLM)
# TODO: rewrite tokenizer
class TokenizerWrapper:
def __init__(self, tokenizer):
self.tokenizer = tokenizer
self.pad_token_id = self.tokenizer.pad_token_id
self.bos_token_id = self.tokenizer.bos_token_id
self.eos_token_id = self.tokenizer.eos_token_id
def __len__(self):
return len(self.tokenizer)
def tokenize(self, text, **kwargs):
return self.tokenizer(
text,
return_token_type_ids=False,
return_tensors="pt",
padding="longest",
truncation=True,
**kwargs,
)
def token2str(self, tokens) -> list:
generated_text = self.tokenizer.batch_decode(tokens, skip_special_tokens=True)
generated_text = [fix_text(text) for text in generated_text]
return generated_text
def detokenize(self, tokens):
toks = [self.tokenizer.convert_ids_to_tokens(tok) for tok in tokens]
for b in range(len(toks)):
for i in reversed(range(len(toks[b]))):
if toks[b][i] is None:
toks[b][i] = ''
toks[b][i] = toks[b][i].replace('Ġ', ' ').strip()
if toks[b][i] in ([self.tokenizer.bos_token, self.tokenizer.eos_token, self.tokenizer.pad_token]):
del toks[b][i]
return toks
LEFT_PATTERN = re.compile(r'(\\left)(\S*)')
RIGHT_PATTERN = re.compile(r'(\\right)(\S*)')
LEFT_COUNT_PATTERN = re.compile(r'\\left(?![a-zA-Z])')
RIGHT_COUNT_PATTERN = re.compile(r'\\right(?![a-zA-Z])')
LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\left\.?|\\right\.?')
def fix_latex_left_right(s):
"""
修复LaTeX中的\\left和\\right命令
1. 确保它们后面跟有效分隔符
2. 平衡\\left和\\right的数量
"""
# 白名单分隔符
valid_delims_list = [r'(', r')', r'[', r']', r'{', r'}', r'/', r'|',
r'\{', r'\}', r'\lceil', r'\rceil', r'\lfloor',
r'\rfloor', r'\backslash', r'\uparrow', r'\downarrow',
r'\Uparrow', r'\Downarrow', r'\|', r'\.']
# 为\left后缺失有效分隔符的情况添加点
def fix_delim(match, is_left=True):
cmd = match.group(1) # \left 或 \right
rest = match.group(2) if len(match.groups()) > 1 else ""
if not rest or rest not in valid_delims_list:
return cmd + "."
return match.group(0)
# 使用更精确的模式匹配\left和\right命令
# 确保它们是独立的命令,不是其他命令的一部分
# 使用预编译正则和统一回调函数
s = LEFT_PATTERN.sub(lambda m: fix_delim(m, True), s)
s = RIGHT_PATTERN.sub(lambda m: fix_delim(m, False), s)
# 更精确地计算\left和\right的数量
left_count = len(LEFT_COUNT_PATTERN.findall(s)) # 不匹配\lefteqn等
right_count = len(RIGHT_COUNT_PATTERN.findall(s)) # 不匹配\rightarrow等
if left_count == right_count:
# 如果数量相等,检查是否在同一组
return fix_left_right_pairs(s)
else:
# 如果数量不等,移除所有\left和\right
# logger.debug(f"latex:{s}")
# logger.warning(f"left_count: {left_count}, right_count: {right_count}")
return LEFT_RIGHT_REMOVE_PATTERN.sub('', s)
def fix_left_right_pairs(latex_formula):
"""
检测并修复LaTeX公式中\\left和\\right不在同一组的情况
Args:
latex_formula (str): 输入的LaTeX公式
Returns:
str: 修复后的LaTeX公式
"""
# 用于跟踪花括号嵌套层级
brace_stack = []
# 用于存储\left信息: (位置, 深度, 分隔符)
left_stack = []
# 存储需要调整的\right信息: (开始位置, 结束位置, 目标位置)
adjustments = []
i = 0
while i < len(latex_formula):
# 检查是否是转义字符
if i > 0 and latex_formula[i - 1] == '\\':
backslash_count = 0
j = i - 1
while j >= 0 and latex_formula[j] == '\\':
backslash_count += 1
j -= 1
if backslash_count % 2 == 1:
i += 1
continue
# 检测\left命令
if i + 5 < len(latex_formula) and latex_formula[i:i + 5] == "\\left" and i + 5 < len(latex_formula):
delimiter = latex_formula[i + 5]
left_stack.append((i, len(brace_stack), delimiter))
i += 6 # 跳过\left和分隔符
continue
# 检测\right命令
elif i + 6 < len(latex_formula) and latex_formula[i:i + 6] == "\\right" and i + 6 < len(latex_formula):
delimiter = latex_formula[i + 6]
if left_stack:
left_pos, left_depth, left_delim = left_stack.pop()
# 如果\left和\right不在同一花括号深度
if left_depth != len(brace_stack):
# 找到\left所在花括号组的结束位置
target_pos = find_group_end(latex_formula, left_pos, left_depth)
if target_pos != -1:
# 记录需要移动的\right
adjustments.append((i, i + 7, target_pos))
i += 7 # 跳过\right和分隔符
continue
# 处理花括号
if latex_formula[i] == '{':
brace_stack.append(i)
elif latex_formula[i] == '}':
if brace_stack:
brace_stack.pop()
i += 1
# 应用调整,从后向前处理以避免索引变化
if not adjustments:
return latex_formula
result = list(latex_formula)
adjustments.sort(reverse=True, key=lambda x: x[0])
for start, end, target in adjustments:
# 提取\right部分
right_part = result[start:end]
# 从原位置删除
del result[start:end]
# 在目标位置插入
result.insert(target, ''.join(right_part))
return ''.join(result)
def find_group_end(text, pos, depth):
"""查找特定深度的花括号组的结束位置"""
current_depth = depth
i = pos
while i < len(text):
if text[i] == '{' and (i == 0 or not is_escaped(text, i)):
current_depth += 1
elif text[i] == '}' and (i == 0 or not is_escaped(text, i)):
current_depth -= 1
if current_depth < depth:
return i
i += 1
return -1 # 未找到对应结束位置
def is_escaped(text, pos):
"""检查字符是否被转义"""
backslash_count = 0
j = pos - 1
while j >= 0 and text[j] == '\\':
backslash_count += 1
j -= 1
return backslash_count % 2 == 1
def fix_unbalanced_braces(latex_formula):
"""
检测LaTeX公式中的花括号是否闭合,并删除无法配对的花括号
Args:
latex_formula (str): 输入的LaTeX公式
Returns:
str: 删除无法配对的花括号后的LaTeX公式
"""
stack = [] # 存储左括号的索引
unmatched = set() # 存储不匹配括号的索引
i = 0
while i < len(latex_formula):
# 检查是否是转义的花括号
if latex_formula[i] in ['{', '}']:
# 计算前面连续的反斜杠数量
backslash_count = 0
j = i - 1
while j >= 0 and latex_formula[j] == '\\':
backslash_count += 1
j -= 1
# 如果前面有奇数个反斜杠,则该花括号是转义的,不参与匹配
if backslash_count % 2 == 1:
i += 1
continue
# 否则,该花括号参与匹配
if latex_formula[i] == '{':
stack.append(i)
else: # latex_formula[i] == '}'
if stack: # 有对应的左括号
stack.pop()
else: # 没有对应的左括号
unmatched.add(i)
i += 1
# 所有未匹配的左括号
unmatched.update(stack)
# 构建新字符串,删除不匹配的括号
return ''.join(char for i, char in enumerate(latex_formula) if i not in unmatched)
def process_latex(input_string):
"""
处理LaTeX公式中的反斜杠:
1. 如果\后跟特殊字符(#$%&~_^\\{})或空格,保持不变
2. 如果\后跟两个小写字母,保持不变
3. 其他情况,在\后添加空格
Args:
input_string (str): 输入的LaTeX公式
Returns:
str: 处理后的LaTeX公式
"""
def replace_func(match):
# 获取\后面的字符
next_char = match.group(1)
# 如果是特殊字符或空格,保持不变
if next_char in "#$%&~_^|\\{} \t\n\r\v\f":
return match.group(0)
# 如果是字母,检查下一个字符
if 'a' <= next_char <= 'z' or 'A' <= next_char <= 'Z':
pos = match.start() + 2 # \x后的位置
if pos < len(input_string) and ('a' <= input_string[pos] <= 'z' or 'A' <= input_string[pos] <= 'Z'):
# 下一个字符也是字母,保持不变
return match.group(0)
# 其他情况,在\后添加空格
return '\\' + ' ' + next_char
# 匹配\后面跟一个字符的情况
pattern = r'\\(.)'
return re.sub(pattern, replace_func, input_string)
# 常见的在KaTeX/MathJax中可用的数学环境
ENV_TYPES = ['array', 'matrix', 'pmatrix', 'bmatrix', 'vmatrix',
'Bmatrix', 'Vmatrix', 'cases', 'aligned', 'gathered']
ENV_BEGIN_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}') for env in ENV_TYPES}
ENV_END_PATTERNS = {env: re.compile(r'\\end\{' + env + r'\}') for env in ENV_TYPES}
ENV_FORMAT_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}\{([^}]*)\}') for env in ENV_TYPES}
def fix_latex_environments(s):
"""
检测LaTeX中环境(如array)的\\begin和\\end是否匹配
1. 如果缺少\\begin标签则在开头添加
2. 如果缺少\\end标签则在末尾添加
"""
for env in ENV_TYPES:
begin_count = len(ENV_BEGIN_PATTERNS[env].findall(s))
end_count = len(ENV_END_PATTERNS[env].findall(s))
if begin_count != end_count:
if end_count > begin_count:
format_match = ENV_FORMAT_PATTERNS[env].search(s)
default_format = '{c}' if env == 'array' else ''
format_str = '{' + format_match.group(1) + '}' if format_match else default_format
missing_count = end_count - begin_count
begin_command = '\\begin{' + env + '}' + format_str + ' '
s = begin_command * missing_count + s
else:
missing_count = begin_count - end_count
s = s + (' \\end{' + env + '}') * missing_count
return s
UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')
COMMANDS_TO_REMOVE_PATTERN = re.compile(
r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph|protect|null)')
REPLACEMENTS_PATTERNS = {
re.compile(r'\\underbar'): r'\\underline',
re.compile(r'\\Bar'): r'\\hat',
re.compile(r'\\Hat'): r'\\hat',
re.compile(r'\\Tilde'): r'\\tilde',
re.compile(r'\\slash'): r'/',
re.compile(r'\\textperthousand'): r'‰',
re.compile(r'\\sun'): r'☉',
re.compile(r'\\textunderscore'): r'\\_',
re.compile(r'\\fint'): r'⨏',
re.compile(r'\\up '): r'\\ ',
re.compile(r'\\vline = '): r'\\models ',
re.compile(r'\\vDash '): r'\\models ',
re.compile(r'\\sq \\sqcup '): r'\\square ',
}
QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')
def latex_rm_whitespace(s: str):
"""Remove unnecessary whitespace from LaTeX code."""
s = fix_unbalanced_braces(s)
s = fix_latex_left_right(s)
s = fix_latex_environments(s)
# 使用预编译的正则表达式
s = UP_PATTERN.sub(
lambda m: m.group(0) if m.group(1) in ["arrow", "downarrow", "lus", "silon"] else f"\\{m.group(1)}", s
)
s = COMMANDS_TO_REMOVE_PATTERN.sub('', s)
# 应用所有替换
for pattern, replacement in REPLACEMENTS_PATTERNS.items():
s = pattern.sub(replacement, s)
# 处理LaTeX中的反斜杠和空格
s = process_latex(s)
# \qquad后补空格
s = QQUAD_PATTERN.sub(r'\\qquad ', s)
return s
class UnimernetModel(VisionEncoderDecoderModel):
def __init__(
self,
config: Optional[PretrainedConfig] = None,
encoder: Optional[PreTrainedModel] = None,
decoder: Optional[PreTrainedModel] = None,
):
# VisionEncoderDecoderModel's checking log has bug, disable for temp.
base_model_logger.disabled = True
try:
super().__init__(config, encoder, decoder)
finally:
base_model_logger.disabled = False
if not config or not hasattr(config, "_name_or_path"):
raise RuntimeError("config._name_or_path is required by UnimernetModel.")
model_path = config._name_or_path
self.transform = UnimerSwinImageProcessor()
self.tokenizer = TokenizerWrapper(AutoTokenizer.from_pretrained(model_path))
self._post_check()
def _post_check(self):
tokenizer = self.tokenizer
if tokenizer.tokenizer.model_max_length != self.config.decoder.max_position_embeddings:
warnings.warn(
f"decoder.max_position_embeddings={self.config.decoder.max_position_embeddings}," +
f" but tokenizer.model_max_length={tokenizer.tokenizer.model_max_length}, will set" +
f" tokenizer.model_max_length to {self.config.decoder.max_position_embeddings}.")
tokenizer.tokenizer.model_max_length = self.config.decoder.max_position_embeddings
assert self.config.decoder.vocab_size == len(tokenizer)
assert self.config.decoder_start_token_id == tokenizer.bos_token_id
assert self.config.pad_token_id == tokenizer.pad_token_id
@classmethod
def from_checkpoint(cls, model_path: str, model_filename: str = "pytorch_model.pth", state_dict_strip_prefix="model.model."):
config = VisionEncoderDecoderConfig.from_pretrained(model_path)
config._name_or_path = model_path
config.encoder = UnimerSwinConfig(**vars(config.encoder))
config.decoder = UnimerMBartConfig(**vars(config.decoder))
encoder = UnimerSwinModel(config.encoder)
decoder = UnimerMBartForCausalLM(config.decoder)
model = cls(config, encoder, decoder)
# load model weights
model_file_path = os.path.join(model_path, model_filename)
checkpoint = torch.load(model_file_path, map_location="cpu", weights_only=True)
state_dict = checkpoint["model"] if "model" in checkpoint else checkpoint
if not state_dict:
raise RuntimeError("state_dict is empty.")
if state_dict_strip_prefix:
state_dict = {
k[len(state_dict_strip_prefix):] if k.startswith(state_dict_strip_prefix) else k: v
for k, v in state_dict.items()
}
missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
if len(unexpected_keys) > 0:
warnings.warn("Unexpected key(s) in state_dict: {}.".format(", ".join(f'"{k}"' for k in unexpected_keys)))
if len(missing_keys) > 0:
raise RuntimeError("Missing key(s) in state_dict: {}.".format(", ".join(f'"{k}"' for k in missing_keys)))
return model
def forward_bak(self, samples):
pixel_values, text = samples["image"], samples["text_input"]
text_inputs = self.tokenizer.tokenize(text).to(pixel_values.device)
decoder_input_ids, decoder_attention_mask = text_inputs["input_ids"], text_inputs["attention_mask"]
num_channels = pixel_values.shape[1]
if num_channels == 1:
pixel_values = pixel_values.repeat(1, 3, 1, 1)
labels = decoder_input_ids * 1
labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)
loss = self.model(
pixel_values=pixel_values,
decoder_input_ids=decoder_input_ids[:, :-1],
decoder_attention_mask=decoder_attention_mask[:, :-1],
labels=labels[:, 1:],
).loss
return {"loss": loss}
def generate(self, samples, do_sample: bool = False, temperature: float = 0.2, top_p: float = 0.95):
pixel_values = samples["image"]
num_channels = pixel_values.shape[1]
if num_channels == 1:
pixel_values = pixel_values.repeat(1, 3, 1, 1)
kwargs = {}
if do_sample:
kwargs["temperature"] = temperature
kwargs["top_p"] = top_p
outputs = super().generate(
pixel_values=pixel_values,
max_new_tokens=self.tokenizer.tokenizer.model_max_length, # required
decoder_start_token_id=self.tokenizer.tokenizer.bos_token_id,
do_sample=do_sample,
**kwargs,
)
outputs = outputs[:, 1:].cpu().numpy()
pred_tokens = self.tokenizer.detokenize(outputs)
pred_str = self.tokenizer.token2str(outputs)
fixed_str = [latex_rm_whitespace(s) for s in pred_str]
return {"pred_ids": outputs, "pred_tokens": pred_tokens, "pred_str": pred_str, "fixed_str": fixed_str}
from .configuration_unimer_mbart import UnimerMBartConfig
from .modeling_unimer_mbart import UnimerMBartModel, UnimerMBartForCausalLM
__all__ = [
"UnimerMBartConfig",
"UnimerMBartModel",
"UnimerMBartForCausalLM",
]
# coding=utf-8
# Copyright 2021, The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""UnimerMBART model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class UnimerMBartConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`MBartModel`]. It is used to instantiate an MBART
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the MBART
[facebook/mbart-large-cc25](https://huggingface.co/facebook/mbart-large-cc25) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 50265):
Vocabulary size of the MBART model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`MBartModel`] or [`TFMBartModel`].
d_model (`int`, *optional*, defaults to 1024):
Dimensionality of the layers and the pooler layer.
qk_squeeze (`int`, *optional*, defaults to 2):
Squeeze ratio for query/key's output dimension. See the [UniMERNet paper](https://arxiv.org/abs/2404.15254).
Squeeze Attention maps the query and key to a lower-dimensional space without excessive loss of information,
thereby accelerating the computation of attention.
encoder_layers (`int`, *optional*, defaults to 12):
Number of encoder layers.
decoder_layers (`int`, *optional*, defaults to 12):
Number of decoder layers.
encoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer encoder.
decoder_attention_heads (`int`, *optional*, defaults to 16):
Number of attention heads for each attention layer in the Transformer decoder.
decoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
encoder_ffn_dim (`int`, *optional*, defaults to 4096):
Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"silu"` and `"gelu_new"` are supported.
dropout (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
activation_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for activations inside the fully connected layer.
classifier_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for classifier.
max_position_embeddings (`int`, *optional*, defaults to 1024):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
init_std (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
encoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details.
decoder_layerdrop (`float`, *optional*, defaults to 0.0):
The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
for more details.
scale_embedding (`bool`, *optional*, defaults to `False`):
Scale embeddings by diving by sqrt(d_model).
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models)
forced_eos_token_id (`int`, *optional*, defaults to 2):
The id of the token to force as the last generated token when `max_length` is reached. Usually set to
`eos_token_id`.
Example:
```python
>>> from transformers import MBartConfig, MBartModel
>>> # Initializing a MBART facebook/mbart-large-cc25 style configuration
>>> configuration = MBartConfig()
>>> # Initializing a model (with random weights) from the facebook/mbart-large-cc25 style configuration
>>> model = MBartModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "unimer-mbart"
keys_to_ignore_at_inference = ["past_key_values"]
attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
def __init__(
self,
vocab_size=50265,
max_position_embeddings=1024,
encoder_layers=12,
encoder_ffn_dim=4096,
encoder_attention_heads=16,
decoder_layers=12,
decoder_ffn_dim=4096,
decoder_attention_heads=16,
encoder_layerdrop=0.0,
decoder_layerdrop=0.0,
use_cache=True,
is_encoder_decoder=True,
activation_function="gelu",
d_model=1024,
qk_squeeze=2,
dropout=0.1,
attention_dropout=0.0,
activation_dropout=0.0,
init_std=0.02,
classifier_dropout=0.0,
scale_embedding=False,
pad_token_id=1,
bos_token_id=0,
eos_token_id=2,
forced_eos_token_id=2,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.d_model = d_model
self.qk_squeeze = qk_squeeze
self.encoder_ffn_dim = encoder_ffn_dim
self.encoder_layers = encoder_layers
self.encoder_attention_heads = encoder_attention_heads
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_layers = decoder_layers
self.decoder_attention_heads = decoder_attention_heads
self.dropout = dropout
self.attention_dropout = attention_dropout
self.activation_dropout = activation_dropout
self.activation_function = activation_function
self.init_std = init_std
self.encoder_layerdrop = encoder_layerdrop
self.decoder_layerdrop = decoder_layerdrop
self.classifier_dropout = classifier_dropout
self.use_cache = use_cache
self.num_hidden_layers = encoder_layers
self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
is_encoder_decoder=is_encoder_decoder,
forced_eos_token_id=forced_eos_token_id,
**kwargs,
)
# coding=utf-8
# Copyright 2021, The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch UnimerMBART model."""
import copy
import math
from dataclasses import dataclass
from typing import List, Optional, Tuple, Union
import torch
import torch.nn.functional as F
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from transformers.activations import ACT2FN
from transformers.modeling_attn_mask_utils import (
_prepare_4d_attention_mask,
_prepare_4d_attention_mask_for_sdpa,
_prepare_4d_causal_attention_mask,
_prepare_4d_causal_attention_mask_for_sdpa,
)
from transformers.modeling_outputs import (
BaseModelOutput,
BaseModelOutputWithPastAndCrossAttentions,
CausalLMOutputWithCrossAttentions,
Seq2SeqLMOutput,
Seq2SeqModelOutput,
Seq2SeqQuestionAnsweringModelOutput,
Seq2SeqSequenceClassifierOutput,
)
from transformers import GenerationMixin, PreTrainedModel
from transformers.utils import (
add_code_sample_docstrings,
add_end_docstrings,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_flash_attn_2_available,
is_flash_attn_greater_or_equal_2_10,
logging,
replace_return_docstrings,
)
from .configuration_unimer_mbart import UnimerMBartConfig
if is_flash_attn_2_available():
from flash_attn import flash_attn_func, flash_attn_varlen_func
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/mbart-large-cc25"
_CONFIG_FOR_DOC = "MBartConfig"
# Base model docstring
_EXPECTED_OUTPUT_SHAPE = [1, 8, 1024]
# Copied from transformers.models.llama.modeling_llama._get_unpad_data
def _get_unpad_data(attention_mask):
seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
max_seqlen_in_batch = seqlens_in_batch.max().item()
cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
return (
indices,
cu_seqlens,
max_seqlen_in_batch,
)
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int):
"""
Shift input ids one token to the right, and wrap the last non pad token (the <LID> token) Note that MBart does not
have a single `decoder_start_token_id` in contrast to other Bart-like models.
"""
prev_output_tokens = input_ids.clone()
if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
# replace possible -100 values in labels by `pad_token_id`
prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id)
index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze()
prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone()
prev_output_tokens[:, 0] = decoder_start_tokens
return prev_output_tokens
@dataclass
class CausalLMOutputWithCrossAttentionsAndCounting(CausalLMOutputWithCrossAttentions):
"""
Base class for causal language model (or autoregressive) outputs.
Args:
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
heads.
cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
sequence_length)`.
Cross attentions weights after the attention softmax, used to compute the weighted average in the
cross-attention heads.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `torch.FloatTensor` tuples of length `config.n_layers`, with each tuple containing the cached key,
value states of the self-attention and the cross-attention layers if model is used in encoder-decoder
setting. Only relevant if `config.is_decoder = True`.
Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
`past_key_values` input) to speed up sequential decoding.
counting:
Counting
"""
counting: Optional[torch.FloatTensor] = None
# Copied from transformers.models.bart.modeling_bart.BartLearnedPositionalEmbedding with Bart->MBart
class UnimerMBartLearnedPositionalEmbedding(nn.Embedding):
"""
This module learns positional embeddings up to a fixed maximum size.
"""
def __init__(self, num_embeddings: int, embedding_dim: int):
# MBart is set up so that if padding_idx is specified then offset the embedding ids by 2
# and adjust num_embeddings appropriately. Other models don't have this hack
self.offset = 2
super().__init__(num_embeddings + self.offset, embedding_dim)
def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
"""`input_ids' shape is expected to be [bsz x seqlen]."""
bsz, seq_len = input_ids.shape[:2]
positions = torch.arange(
past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
).expand(bsz, -1)
return super().forward(positions + self.offset)
# Copied from transformers.models.bart.modeling_bart.BartScaledWordEmbedding with Bart->MBart
class UnimerMBartScaledWordEmbedding(nn.Embedding):
"""
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
"""
def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
super().__init__(num_embeddings, embedding_dim, padding_idx)
self.embed_scale = embed_scale
def forward(self, input_ids: torch.Tensor):
return super().forward(input_ids) * self.embed_scale
# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->MBart
class UnimerMBartAttention(nn.Module):
"""Multi-headed attention from 'Attention Is All You Need' paper, with qk_squeeze"""
def __init__(
self,
embed_dim: int,
num_heads: int,
dropout: float = 0.0,
is_decoder: bool = False,
bias: bool = True,
is_causal: bool = False,
*,
config: UnimerMBartConfig,
):
super().__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
self.config = config
if (self.head_dim * num_heads) != self.embed_dim:
raise ValueError(
f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
f" and `num_heads`: {num_heads})."
)
self.squeeze_dim = embed_dim // config.qk_squeeze
self.squeeze_head_dim = self.squeeze_dim // num_heads
self.scaling = self.squeeze_head_dim**-0.5
self.is_decoder = is_decoder
self.is_causal = is_causal
self.q_proj = nn.Linear(embed_dim, self.squeeze_dim, bias=bias)
self.k_proj = nn.Linear(embed_dim, self.squeeze_dim, bias=bias)
self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
def _shape_qk(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.squeeze_head_dim).transpose(1, 2).contiguous()
def _shape_v(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel"""
# if key_value_states are provided this layer is used as a cross-attention layer
# for the decoder
is_cross_attention = key_value_states is not None
bsz, tgt_len, _ = hidden_states.size()
# get query proj
query_states = self.q_proj(hidden_states) * self.scaling
# get key, value proj
# `past_key_value[0].shape[2] == key_value_states.shape[1]`
# is checking that the `sequence_length` of the `past_key_value` is the same as
# the provided `key_value_states` to support prefix tuning
if (
is_cross_attention
and past_key_value is not None
and past_key_value[0].shape[2] == key_value_states.shape[1]
):
# reuse k,v, cross_attentions
key_states = past_key_value[0]
value_states = past_key_value[1]
elif is_cross_attention:
# cross_attentions
key_states = self._shape_qk(self.k_proj(key_value_states), -1, bsz)
value_states = self._shape_v(self.v_proj(key_value_states), -1, bsz)
elif past_key_value is not None:
# reuse k, v, self_attention
key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
key_states = torch.cat([past_key_value[0], key_states], dim=2)
value_states = torch.cat([past_key_value[1], value_states], dim=2)
else:
# self_attention
key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
if self.is_decoder:
# if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
# Further calls to cross_attention layer can then reuse all cross-attention
# key/value_states (first "if" case)
# if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
# all previous decoder key/value_states. Further calls to uni-directional self-attention
# can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
# if encoder bi-directional self-attention `past_key_value` is always `None`
past_key_value = (key_states, value_states)
proj_shape = (bsz * self.num_heads, -1, self.squeeze_head_dim)
value_shape = (bsz * self.num_heads, -1, self.head_dim)
query_states = self._shape_qk(query_states, tgt_len, bsz).view(*proj_shape)
key_states = key_states.reshape(*proj_shape)
value_states = value_states.reshape(*value_shape)
src_len = key_states.size(1)
attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
raise ValueError(
f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
f" {attn_weights.size()}"
)
if attention_mask is not None:
if attention_mask.size() != (bsz, 1, tgt_len, src_len):
raise ValueError(
f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
)
attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
attn_weights = nn.functional.softmax(attn_weights, dim=-1)
if layer_head_mask is not None:
if layer_head_mask.size() != (self.num_heads,):
raise ValueError(
f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
f" {layer_head_mask.size()}"
)
attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
if output_attentions:
# this operation is a bit awkward, but it's required to
# make sure that attn_weights keeps its gradient.
# In order to do so, attn_weights have to be reshaped
# twice and have to be reused in the following
attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
else:
attn_weights_reshaped = None
attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
attn_output = torch.bmm(attn_probs, value_states)
if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
attn_output = attn_output.transpose(1, 2)
# Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
# partitioned across GPUs when using tensor-parallelism.
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
attn_output = self.out_proj(attn_output)
return attn_output, attn_weights_reshaped, past_key_value
# Copied from transformers.models.bart.modeling_bart.BartFlashAttention2 with Bart->MBart
class UnimerMBartFlashAttention2(UnimerMBartAttention):
"""
MBart flash attention module. This module inherits from `MBartSqueezeAttention` as the weights of the module stays
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
"""
# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
# flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
# Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
# def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
# return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
def _shape_qk(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.squeeze_head_dim)
def _shape_v(self, tensor: torch.Tensor, seq_len: int, bsz: int):
return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
# MBartFlashAttention2 attention does not support output_attentions
if output_attentions:
raise ValueError("MBartFlashAttention2 attention does not support output_attentions")
# if key_value_states are provided this layer is used as a cross-attention layer
# for the decoder
is_cross_attention = key_value_states is not None
bsz, q_len, _ = hidden_states.size()
# get query proj
query_states = self._shape_qk(self.q_proj(hidden_states), -1, bsz)
# get key, value proj
# `past_key_value[0].shape[2] == key_value_states.shape[1]`
# is checking that the `sequence_length` of the `past_key_value` is the same as
# the provided `key_value_states` to support prefix tuning
if (
is_cross_attention
and past_key_value is not None
and past_key_value[0].shape[2] == key_value_states.shape[1]
):
# reuse k,v, cross_attentions
key_states = past_key_value[0].transpose(1, 2)
value_states = past_key_value[1].transpose(1, 2)
elif is_cross_attention:
# cross_attentions
key_states = self._shape_qk(self.k_proj(key_value_states), -1, bsz)
value_states = self._shape_v(self.v_proj(key_value_states), -1, bsz)
elif past_key_value is not None:
# reuse k, v, self_attention
key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1)
value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1)
else:
# self_attention
key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
if self.is_decoder:
# if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
# Further calls to cross_attention layer can then reuse all cross-attention
# key/value_states (first "if" case)
# if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
# all previous decoder key/value_states. Further calls to uni-directional self-attention
# can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
# if encoder bi-directional self-attention `past_key_value` is always `None`
past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2))
kv_seq_len = key_states.shape[-2]
if past_key_value is not None:
kv_seq_len += past_key_value[0].shape[-2]
# In PEFT, usually we cast the layer norms in float32 for training stability reasons
# therefore the input hidden states gets silently casted in float32. Hence, we need
# cast them back in the correct dtype just to be sure everything works as expected.
# This might slowdown training & inference so it is recommended to not cast the LayerNorms
# in fp32. (LlamaRMSNorm handles it correctly)
input_dtype = query_states.dtype
if input_dtype == torch.float32:
if torch.is_autocast_enabled():
target_dtype = torch.get_autocast_gpu_dtype()
# Handle the case where the model is quantized
elif hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.q_proj.weight.dtype
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
)
query_states = query_states.to(target_dtype)
key_states = key_states.to(target_dtype)
value_states = value_states.to(target_dtype)
attn_output = self._flash_attention_forward(
query_states, key_states, value_states, attention_mask, q_len, dropout=self.dropout
)
attn_output = attn_output.reshape(bsz, q_len, -1)
attn_output = self.out_proj(attn_output)
if not output_attentions:
attn_weights = None
return attn_output, attn_weights, past_key_value
# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
def _flash_attention_forward(
self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
):
"""
Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
first unpad the input, then computes the attention scores and pad the final attention scores.
Args:
query_states (`torch.Tensor`):
Input query states to be passed to Flash Attention API
key_states (`torch.Tensor`):
Input key states to be passed to Flash Attention API
value_states (`torch.Tensor`):
Input value states to be passed to Flash Attention API
attention_mask (`torch.Tensor`):
The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
position of padding tokens and 1 for the position of non-padding tokens.
dropout (`float`):
Attention dropout
softmax_scale (`float`, *optional*):
The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
"""
if not self._flash_attn_uses_top_left_mask:
causal = self.is_causal
else:
# TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
causal = self.is_causal and query_length != 1
# Contains at least one padding token in the sequence
if attention_mask is not None:
batch_size = query_states.shape[0]
query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
query_states, key_states, value_states, attention_mask, query_length
)
cu_seqlens_q, cu_seqlens_k = cu_seq_lens
max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
attn_output_unpad = flash_attn_varlen_func(
query_states,
key_states,
value_states,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_k=cu_seqlens_k,
max_seqlen_q=max_seqlen_in_batch_q,
max_seqlen_k=max_seqlen_in_batch_k,
dropout_p=dropout,
softmax_scale=softmax_scale,
causal=causal,
)
attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
else:
attn_output = flash_attn_func(
query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
)
return attn_output
# Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
key_layer = index_first_axis(
key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
value_layer = index_first_axis(
value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
)
if query_length == kv_seq_len:
query_layer = index_first_axis(
query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
)
cu_seqlens_q = cu_seqlens_k
max_seqlen_in_batch_q = max_seqlen_in_batch_k
indices_q = indices_k
elif query_length == 1:
max_seqlen_in_batch_q = 1
cu_seqlens_q = torch.arange(
batch_size + 1, dtype=torch.int32, device=query_layer.device
) # There is a memcpy here, that is very bad.
indices_q = cu_seqlens_q[:-1]
query_layer = query_layer.squeeze(1)
else:
# The -q_len: slice assumes left padding.
attention_mask = attention_mask[:, -query_length:]
query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
return (
query_layer,
key_layer,
value_layer,
indices_q,
(cu_seqlens_q, cu_seqlens_k),
(max_seqlen_in_batch_q, max_seqlen_in_batch_k),
)
class UnimerMBartSdpaAttention(UnimerMBartAttention):
def forward(
self,
hidden_states: torch.Tensor,
key_value_states: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
output_attentions: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
"""Input shape: Batch x Time x Channel"""
if output_attentions or layer_head_mask is not None:
# TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
logger.warning(
"BartModel is using BartSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
)
return super().forward(
hidden_states,
key_value_states=key_value_states,
past_key_value=past_key_value,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
# if key_value_states are provided this layer is used as a cross-attention layer
# for the decoder
is_cross_attention = key_value_states is not None
bsz, tgt_len, _ = hidden_states.size()
# get query proj
query_states = self.q_proj(hidden_states)
# get key, value proj
# `past_key_value[0].shape[2] == key_value_states.shape[1]`
# is checking that the `sequence_length` of the `past_key_value` is the same as
# the provided `key_value_states` to support prefix tuning
if (
is_cross_attention
and past_key_value is not None
and past_key_value[0].shape[2] == key_value_states.shape[1]
):
# reuse k,v, cross_attentions
key_states = past_key_value[0]
value_states = past_key_value[1]
elif is_cross_attention:
# cross_attentions
key_states = self._shape_qk(self.k_proj(key_value_states), -1, bsz)
value_states = self._shape_v(self.v_proj(key_value_states), -1, bsz)
elif past_key_value is not None:
# reuse k, v, self_attention
key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
key_states = torch.cat([past_key_value[0], key_states], dim=2)
value_states = torch.cat([past_key_value[1], value_states], dim=2)
else:
# self_attention
key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
if self.is_decoder:
# if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
# Further calls to cross_attention layer can then reuse all cross-attention
# key/value_states (first "if" case)
# if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
# all previous decoder key/value_states. Further calls to uni-directional self-attention
# can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
# if encoder bi-directional self-attention `past_key_value` is always `None`
past_key_value = (key_states, value_states)
query_states = self._shape_qk(query_states, tgt_len, bsz)
# We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
# in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
# The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False
# NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
# but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
attn_output = torch.nn.functional.scaled_dot_product_attention(
query_states,
key_states,
value_states,
attn_mask=attention_mask,
dropout_p=self.dropout if self.training else 0.0,
is_causal=is_causal,
)
if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
raise ValueError(
f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
f" {attn_output.size()}"
)
attn_output = attn_output.transpose(1, 2)
# Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
# partitioned across GPUs when using tensor-parallelism.
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
attn_output = self.out_proj(attn_output)
return attn_output, None, past_key_value
UNIMER_MBART_ATTENTION_CLASSES = {
"eager": UnimerMBartAttention,
"flash_attention_2": UnimerMBartFlashAttention2,
"sdpa": UnimerMBartSdpaAttention,
}
class UnimerMBartEncoderLayer(nn.Module):
def __init__(self, config: UnimerMBartConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = UNIMER_MBART_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.encoder_attention_heads,
dropout=config.attention_dropout,
config=config,
)
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: torch.Tensor,
layer_head_mask: torch.Tensor,
output_attentions: bool = False,
) -> torch.Tensor:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
hidden_states, attn_weights, _ = self.self_attn(
hidden_states=hidden_states,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
if hidden_states.dtype == torch.float16 and (
torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
):
clamp_value = torch.finfo(hidden_states.dtype).max - 1000
hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
outputs = (hidden_states,)
if output_attentions:
outputs += (attn_weights,)
return outputs
class UnimerMBartDecoderLayer(nn.Module):
def __init__(self, config: UnimerMBartConfig):
super().__init__()
self.embed_dim = config.d_model
self.self_attn = UNIMER_MBART_ATTENTION_CLASSES[config._attn_implementation](
embed_dim=self.embed_dim,
num_heads=config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
is_causal=True,
config=config,
)
self.dropout = config.dropout
self.activation_fn = ACT2FN[config.activation_function]
self.activation_dropout = config.activation_dropout
self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.encoder_attn = UNIMER_MBART_ATTENTION_CLASSES[config._attn_implementation](
self.embed_dim,
config.decoder_attention_heads,
dropout=config.attention_dropout,
is_decoder=True,
config=config,
)
self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
self.final_layer_norm = nn.LayerNorm(self.embed_dim)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
layer_head_mask: Optional[torch.Tensor] = None,
cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
past_key_value: Optional[Tuple[torch.Tensor]] = None,
output_attentions: Optional[bool] = False,
use_cache: Optional[bool] = True,
) -> torch.Tensor:
"""
Args:
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
attention_mask (`torch.FloatTensor`): attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
encoder_hidden_states (`torch.FloatTensor`):
cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
`(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
`(encoder_attention_heads,)`.
cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
size `(decoder_attention_heads,)`.
past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
"""
residual = hidden_states
hidden_states = self.self_attn_layer_norm(hidden_states)
# Self Attention
# decoder uni-directional self-attention cached key/values tuple is at positions 1,2
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
# add present self-attn cache to positions 1,2 of present_key_value tuple
hidden_states, self_attn_weights, present_key_value = self.self_attn(
hidden_states=hidden_states,
past_key_value=self_attn_past_key_value,
attention_mask=attention_mask,
layer_head_mask=layer_head_mask,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
# Cross-Attention Block
cross_attn_present_key_value = None
cross_attn_weights = None
if encoder_hidden_states is not None:
residual = hidden_states
hidden_states = self.encoder_attn_layer_norm(hidden_states)
# cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
hidden_states=hidden_states,
key_value_states=encoder_hidden_states,
attention_mask=encoder_attention_mask,
layer_head_mask=cross_attn_layer_head_mask,
past_key_value=cross_attn_past_key_value,
output_attentions=output_attentions,
)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
# add cross-attn to positions 3,4 of present_key_value tuple
present_key_value = present_key_value + cross_attn_present_key_value
# Fully Connected
residual = hidden_states
hidden_states = self.final_layer_norm(hidden_states)
hidden_states = self.activation_fn(self.fc1(hidden_states))
hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
hidden_states = self.fc2(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
hidden_states = residual + hidden_states
outputs = (hidden_states,)
if output_attentions:
outputs += (self_attn_weights, cross_attn_weights)
if use_cache:
outputs += (present_key_value,)
return outputs
# Copied from transformers.models.bart.modeling_bart.BartClassificationHead with Bart->MBart
class UnimerMBartClassificationHead(nn.Module):
"""Head for sentence-level classification tasks."""
def __init__(
self,
input_dim: int,
inner_dim: int,
num_classes: int,
pooler_dropout: float,
):
super().__init__()
self.dense = nn.Linear(input_dim, inner_dim)
self.dropout = nn.Dropout(p=pooler_dropout)
self.out_proj = nn.Linear(inner_dim, num_classes)
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
hidden_states = self.dropout(hidden_states)
hidden_states = self.dense(hidden_states)
hidden_states = torch.tanh(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.out_proj(hidden_states)
return hidden_states
class UnimerMBartPreTrainedModel(PreTrainedModel):
config_class = UnimerMBartConfig
base_model_prefix = "model"
supports_gradient_checkpointing = True
_no_split_modules = ["MBartDecoderLayer", "MBartSqueezeAttention"]
_supports_flash_attn_2 = True
_supports_sdpa = True
def _init_weights(self, module):
std = self.config.init_std
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
@property
def dummy_inputs(self):
pad_token = self.config.pad_token_id
input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
dummy_inputs = {
"attention_mask": input_ids.ne(pad_token),
"input_ids": input_ids,
}
return dummy_inputs
MBART_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`MBartConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
MBART_GENERATION_EXAMPLE = r"""
Translation example:
```python
>>> from transformers import AutoTokenizer, MBartForConditionalGeneration
>>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-en-ro")
>>> example_english_phrase = "42 is the answer"
>>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
>>> # Translate
>>> generated_ids = model.generate(**inputs, num_beams=4, max_length=5)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
'42 este răspuns'
```
Mask filling example:
```python
>>> from transformers import AutoTokenizer, MBartForConditionalGeneration
>>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")
>>> # de_DE is the language symbol id <LID> for German
>>> TXT = "</s> Meine Freunde sind <mask> nett aber sie essen zu viel Kuchen. </s> de_DE"
>>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="pt")["input_ids"]
>>> logits = model(input_ids).logits
>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
>>> probs = logits[0, masked_index].softmax(dim=0)
>>> values, predictions = probs.topk(5)
>>> tokenizer.decode(predictions).split()
['nett', 'sehr', 'ganz', 'nicht', 'so']
```
"""
MBART_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Indices of decoder input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are decoder input IDs?](../glossary#decoder-input-ids)
MBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
varies according to source and target language, *e.g.* 25004 for *en_XX*, and 25003 for *de_DE*. If
`past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
`past_key_values`).
For translation and summarization training, `decoder_input_ids` should be provided. If no
`decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
for denoising pre-training following the paper.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
be used by default.
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
`last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
input (see `past_key_values`). This is useful if you want more control over how to convert
`decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
of `inputs_embeds`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
class UnimerMBartEncoder(UnimerMBartPreTrainedModel):
"""
Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
[`MBartEncoderLayer`].
Args:
config: MBartConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: UnimerMBartConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.encoder_layerdrop
embed_dim = config.d_model
self.padding_idx = config.pad_token_id
self.max_source_positions = config.max_position_embeddings
embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
self.embed_tokens = UnimerMBartScaledWordEmbedding(
config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = UnimerMBartLearnedPositionalEmbedding(
config.max_position_embeddings,
embed_dim,
)
self.layers = nn.ModuleList([UnimerMBartEncoderLayer(config) for _ in range(config.encoder_layers)])
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self._use_sdpa = config._attn_implementation == "sdpa"
self.layernorm_embedding = nn.LayerNorm(embed_dim)
self.layer_norm = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def _backward_compatibility_gradient_checkpointing(self):
# Override to not delete the attribute from the config
if self.supports_gradient_checkpointing and getattr(self.config, "gradient_checkpointing", False):
self.gradient_checkpointing_enable()
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
head_mask: Optional[torch.Tensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutput]:
r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
elif input_ids is not None:
input = input_ids
input_shape = input.shape
input_ids = input_ids.view(-1, input_shape[-1])
elif inputs_embeds is not None:
input = inputs_embeds[:, :, -1]
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
embed_pos = self.embed_positions(input)
hidden_states = inputs_embeds + embed_pos.to(inputs_embeds.device)
hidden_states = self.layernorm_embedding(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
# expand attention_mask
if attention_mask is not None:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
if self._use_flash_attention_2:
attention_mask = attention_mask if 0 in attention_mask else None
elif self._use_sdpa and head_mask is None and not output_attentions:
# output_attentions=True & head_mask can not be supported when using SDPA, fall back to
# the manual implementation that requires a 4D causal mask in all cases.
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
else:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
encoder_states = () if output_hidden_states else None
all_attentions = () if output_attentions else None
# check if head_mask has a correct number of layers specified if desired
if head_mask is not None:
if head_mask.size()[0] != len(self.layers):
raise ValueError(
f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
f" {head_mask.size()[0]}."
)
for idx, encoder_layer in enumerate(self.layers):
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
to_drop = False
if self.training:
dropout_probability = torch.rand([])
if dropout_probability < self.layerdrop: # skip the layer
to_drop = True
if to_drop:
layer_outputs = (None, None)
else:
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
encoder_layer.__call__,
hidden_states,
attention_mask,
(head_mask[idx] if head_mask is not None else None),
output_attentions,
)
else:
layer_outputs = encoder_layer(
hidden_states,
attention_mask,
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
output_attentions=output_attentions,
)
hidden_states = layer_outputs[0]
if output_attentions:
all_attentions = all_attentions + (layer_outputs[1],)
hidden_states = self.layer_norm(hidden_states)
if output_hidden_states:
encoder_states = encoder_states + (hidden_states,)
if not return_dict:
return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
return BaseModelOutput(
last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
)
class UnimerMBartDecoder(UnimerMBartPreTrainedModel):
"""
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MBartDecoderLayer`]
Args:
config: MBartConfig
embed_tokens (nn.Embedding): output embedding
"""
def __init__(self, config: UnimerMBartConfig, embed_tokens: Optional[nn.Embedding] = None):
super().__init__(config)
self.dropout = config.dropout
self.layerdrop = config.decoder_layerdrop
self.padding_idx = config.pad_token_id
self.max_target_positions = config.max_position_embeddings
embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
self.embed_tokens = UnimerMBartScaledWordEmbedding(
config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
)
if embed_tokens is not None:
self.embed_tokens.weight = embed_tokens.weight
self.embed_positions = UnimerMBartLearnedPositionalEmbedding(
config.max_position_embeddings,
config.d_model,
)
self.layers = nn.ModuleList([UnimerMBartDecoderLayer(config) for _ in range(config.decoder_layers)])
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
self._use_sdpa = config._attn_implementation == "sdpa"
self.layernorm_embedding = nn.LayerNorm(config.d_model)
self.layer_norm = nn.LayerNorm(config.d_model)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.embed_tokens
def set_input_embeddings(self, value):
self.embed_tokens = value
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
count_pred: Optional[torch.FloatTensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
of the decoder.
encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
cross-attention on hidden heads. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert `input_ids` indices into associated vectors
than the model's internal embedding lookup matrix.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
elif input_ids is not None:
input = input_ids
input_shape = input.size()
input_ids = input_ids.view(-1, input_shape[-1])
elif inputs_embeds is not None:
input_shape = inputs_embeds.size()[:-1]
input = inputs_embeds[:, :, -1]
else:
raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
# past_key_values_length
past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)
if self._use_flash_attention_2:
# 2d mask is passed through the layers
attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
elif self._use_sdpa and not output_attentions and cross_attn_head_mask is None:
# output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
# the manual implementation that requires a 4D causal mask in all cases.
attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
attention_mask,
input_shape,
inputs_embeds,
past_key_values_length,
)
else:
# 4d mask is passed through the layers
attention_mask = _prepare_4d_causal_attention_mask(
attention_mask, input_shape, inputs_embeds, past_key_values_length
)
# expand encoder attention mask
if encoder_hidden_states is not None and encoder_attention_mask is not None:
if self._use_flash_attention_2:
encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
elif self._use_sdpa and cross_attn_head_mask is None and not output_attentions:
# output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
# the manual implementation that requires a 4D causal mask in all cases.
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
encoder_attention_mask,
inputs_embeds.dtype,
tgt_len=input_shape[-1],
)
else:
# [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
encoder_attention_mask = _prepare_4d_attention_mask(
encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
)
# embed positions
positions = self.embed_positions(input, past_key_values_length)
hidden_states = inputs_embeds + positions.to(inputs_embeds.device)
# TODO: add counting context weight to hidden_states
if count_pred is not None:
count_context_weight = self.counting_context_weight(count_pred)
hidden_states = hidden_states + 0.5 * count_context_weight.unsqueeze(1)
hidden_states = self.layernorm_embedding(hidden_states)
hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
if self.gradient_checkpointing and self.training:
if use_cache:
logger.warning_once(
"`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..."
)
use_cache = False
# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
next_decoder_cache = () if use_cache else None
# check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
if attn_mask is not None:
if attn_mask.size()[0] != len(self.layers):
raise ValueError(
f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
f" {attn_mask.size()[0]}."
)
for idx, decoder_layer in enumerate(self.layers):
# add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
if output_hidden_states:
all_hidden_states += (hidden_states,)
if self.training:
dropout_probability = torch.rand([])
if dropout_probability < self.layerdrop:
continue
past_key_value = past_key_values[idx] if past_key_values is not None else None
if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
decoder_layer.__call__,
hidden_states,
attention_mask,
encoder_hidden_states,
encoder_attention_mask,
head_mask[idx] if head_mask is not None else None,
cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
None,
output_attentions,
use_cache,
)
else:
layer_outputs = decoder_layer(
hidden_states,
attention_mask=attention_mask,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
layer_head_mask=(head_mask[idx] if head_mask is not None else None),
cross_attn_layer_head_mask=(
cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
),
past_key_value=past_key_value,
output_attentions=output_attentions,
use_cache=use_cache,
)
hidden_states = layer_outputs[0]
if use_cache:
next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
if output_attentions:
all_self_attns += (layer_outputs[1],)
if encoder_hidden_states is not None:
all_cross_attentions += (layer_outputs[2],)
hidden_states = self.layer_norm(hidden_states)
# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)
next_cache = next_decoder_cache if use_cache else None
if not return_dict:
return tuple(
v
for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
if v is not None
)
return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
cross_attentions=all_cross_attentions,
)
@add_start_docstrings(
"The bare MBART Model outputting raw hidden-states without any specific head on top.",
MBART_START_DOCSTRING,
)
class UnimerMBartModel(UnimerMBartPreTrainedModel):
_tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
def __init__(self, config: UnimerMBartConfig):
super().__init__(config)
padding_idx, vocab_size = config.pad_token_id, config.vocab_size
self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
self.encoder = UnimerMBartEncoder(config, self.shared)
self.decoder = UnimerMBartDecoder(config, self.shared)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.shared
def set_input_embeddings(self, value):
self.shared = value
self.encoder.embed_tokens = self.shared
self.decoder.embed_tokens = self.shared
def get_encoder(self):
return self.encoder
def get_decoder(self):
return self.decoder
def _tie_weights(self):
if self.config.tie_word_embeddings:
self._tie_or_clone_weights(self.encoder.embed_tokens, self.get_input_embeddings())
self._tie_or_clone_weights(self.decoder.embed_tokens, self.get_input_embeddings())
@add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqModelOutput,
config_class=_CONFIG_FOR_DOC,
expected_output=_EXPECTED_OUTPUT_SHAPE,
)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Seq2SeqModelOutput, Tuple[torch.FloatTensor]]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# different to other models, MBart automatically creates decoder_input_ids from
# input_ids if no decoder_input_ids are provided
if decoder_input_ids is None and decoder_inputs_embeds is None:
decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id)
if encoder_outputs is None:
encoder_outputs = self.encoder(
input_ids=input_ids,
attention_mask=attention_mask,
head_mask=head_mask,
inputs_embeds=inputs_embeds,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
# If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
encoder_outputs = BaseModelOutput(
last_hidden_state=encoder_outputs[0],
hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
)
# decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
decoder_outputs = self.decoder(
input_ids=decoder_input_ids,
attention_mask=decoder_attention_mask,
encoder_hidden_states=encoder_outputs[0],
encoder_attention_mask=attention_mask,
head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
if not return_dict:
return decoder_outputs + encoder_outputs
return Seq2SeqModelOutput(
last_hidden_state=decoder_outputs.last_hidden_state,
past_key_values=decoder_outputs.past_key_values,
decoder_hidden_states=decoder_outputs.hidden_states,
decoder_attentions=decoder_outputs.attentions,
cross_attentions=decoder_outputs.cross_attentions,
encoder_last_hidden_state=encoder_outputs.last_hidden_state,
encoder_hidden_states=encoder_outputs.hidden_states,
encoder_attentions=encoder_outputs.attentions,
)
@add_start_docstrings(
"The MBART Model with a language modeling head. Can be used for summarization, after fine-tuning the pretrained models.",
MBART_START_DOCSTRING,
)
class UnimerMBartForConditionalGeneration(UnimerMBartPreTrainedModel, GenerationMixin):
base_model_prefix = "model"
_keys_to_ignore_on_load_missing = ["final_logits_bias"]
_tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight", "lm_head.weight"]
def __init__(self, config: UnimerMBartConfig):
super().__init__(config)
self.model = UnimerMBartModel(config)
self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_encoder(self):
return self.model.get_encoder()
def get_decoder(self):
return self.model.get_decoder()
def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
self._resize_final_logits_bias(new_embeddings.weight.shape[0])
return new_embeddings
def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
old_num_tokens = self.final_logits_bias.shape[-1]
if new_num_tokens <= old_num_tokens:
new_bias = self.final_logits_bias[:, :new_num_tokens]
else:
extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
self.register_buffer("final_logits_bias", new_bias)
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
@add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
@add_end_docstrings(MBART_GENERATION_EXAMPLE)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
Returns:
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
if use_cache:
logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
use_cache = False
if decoder_input_ids is None and decoder_inputs_embeds is None:
decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id)
outputs = self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
encoder_outputs=encoder_outputs,
decoder_attention_mask=decoder_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
masked_lm_loss = None
if labels is not None:
loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (lm_logits,) + outputs[1:]
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
return Seq2SeqLMOutput(
loss=masked_lm_loss,
logits=lm_logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
def prepare_inputs_for_generation(
self,
decoder_input_ids,
past_key_values=None,
attention_mask=None,
head_mask=None,
decoder_head_mask=None,
cross_attn_head_mask=None,
use_cache=None,
encoder_outputs=None,
**kwargs,
):
# cut decoder_input_ids if past is used
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]
# Some generation methods already pass only the last input ID
if decoder_input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# Default to old behavior: keep only final ID
remove_prefix_length = decoder_input_ids.shape[1] - 1
decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
return {
"input_ids": None, # encoder_outputs is defined. input_ids not needed
"encoder_outputs": encoder_outputs,
"past_key_values": past_key_values,
"decoder_input_ids": decoder_input_ids,
"attention_mask": attention_mask,
"head_mask": head_mask,
"decoder_head_mask": decoder_head_mask,
"cross_attn_head_mask": cross_attn_head_mask,
"use_cache": use_cache, # change this to avoid caching (presumably for debugging)
}
def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
return shift_tokens_right(labels, self.config.pad_token_id)
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
# cached cross_attention states don't have to be reordered -> they are always the same
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ layer_past[2:],
)
return reordered_past
@add_start_docstrings(
"""
MBart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
tasks.
""",
MBART_START_DOCSTRING,
)
class UnimerMBartForSequenceClassification(UnimerMBartPreTrainedModel):
_tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"]
def __init__(self, config: UnimerMBartConfig, **kwargs):
super().__init__(config, **kwargs)
self.model = UnimerMBartModel(config)
self.classification_head = UnimerMBartClassificationHead(
config.d_model,
config.d_model,
config.num_labels,
config.classifier_dropout,
)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqSequenceClassifierOutput,
config_class=_CONFIG_FOR_DOC,
)
# Copied from transformers.models.bart.modeling_bart.BartForSequenceClassification.forward
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqSequenceClassifierOutput]:
r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if labels is not None:
use_cache = False
if input_ids is None and inputs_embeds is not None:
raise NotImplementedError(
f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
)
outputs = self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
encoder_outputs=encoder_outputs,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0] # last hidden state
eos_mask = input_ids.eq(self.config.eos_token_id).to(hidden_states.device)
if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
raise ValueError("All examples must have the same number of <eos> tokens.")
sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[
:, -1, :
]
logits = self.classification_head(sentence_representation)
loss = None
if labels is not None:
labels = labels.to(logits.device)
if self.config.problem_type is None:
if self.config.num_labels == 1:
self.config.problem_type = "regression"
elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
if self.config.num_labels == 1:
loss = loss_fct(logits.squeeze(), labels.squeeze())
else:
loss = loss_fct(logits, labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[1:]
return ((loss,) + output) if loss is not None else output
return Seq2SeqSequenceClassifierOutput(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
@add_start_docstrings(
"""
MBART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
""",
MBART_START_DOCSTRING,
)
class UnimerMBartForQuestionAnswering(UnimerMBartPreTrainedModel):
_tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"]
def __init__(self, config):
super().__init__(config)
config.num_labels = 2
self.num_labels = config.num_labels
self.model = UnimerMBartModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
# Initialize weights and apply final processing
self.post_init()
@add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
@add_code_sample_docstrings(
checkpoint=_CHECKPOINT_FOR_DOC,
output_type=Seq2SeqQuestionAnsweringModelOutput,
config_class=_CONFIG_FOR_DOC,
)
# Copied from transformers.models.bart.modeling_bart.BartForQuestionAnswering.forward
def forward(
self,
input_ids: torch.Tensor = None,
attention_mask: Optional[torch.Tensor] = None,
decoder_input_ids: Optional[torch.LongTensor] = None,
decoder_attention_mask: Optional[torch.LongTensor] = None,
head_mask: Optional[torch.Tensor] = None,
decoder_head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
encoder_outputs: Optional[List[torch.FloatTensor]] = None,
start_positions: Optional[torch.LongTensor] = None,
end_positions: Optional[torch.LongTensor] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, Seq2SeqQuestionAnsweringModelOutput]:
r"""
start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the start of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
are not taken into account for computing the loss.
end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
Labels for position (index) of the end of the labelled span for computing the token classification loss.
Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
are not taken into account for computing the loss.
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
if start_positions is not None and end_positions is not None:
use_cache = False
outputs = self.model(
input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
head_mask=head_mask,
decoder_head_mask=decoder_head_mask,
cross_attn_head_mask=cross_attn_head_mask,
encoder_outputs=encoder_outputs,
inputs_embeds=inputs_embeds,
decoder_inputs_embeds=decoder_inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
sequence_output = outputs[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1).contiguous()
end_logits = end_logits.squeeze(-1).contiguous()
total_loss = None
if start_positions is not None and end_positions is not None:
# If we are on multi-GPU, split add a dimension
if len(start_positions.size()) > 1:
start_positions = start_positions.squeeze(-1)
if len(end_positions.size()) > 1:
end_positions = end_positions.squeeze(-1)
# sometimes the start/end positions are outside our model inputs, we ignore these terms
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
start_loss = loss_fct(start_logits, start_positions)
end_loss = loss_fct(end_logits, end_positions)
total_loss = (start_loss + end_loss) / 2
if not return_dict:
output = (
start_logits,
end_logits,
) + outputs[1:]
return ((total_loss,) + output) if total_loss is not None else output
return Seq2SeqQuestionAnsweringModelOutput(
loss=total_loss,
start_logits=start_logits,
end_logits=end_logits,
past_key_values=outputs.past_key_values,
decoder_hidden_states=outputs.decoder_hidden_states,
decoder_attentions=outputs.decoder_attentions,
cross_attentions=outputs.cross_attentions,
encoder_last_hidden_state=outputs.encoder_last_hidden_state,
encoder_hidden_states=outputs.encoder_hidden_states,
encoder_attentions=outputs.encoder_attentions,
)
# Copied from transformers.models.bart.modeling_bart.BartDecoderWrapper with Bart->MBart
class UnimerMBartDecoderWrapper(UnimerMBartPreTrainedModel):
"""
This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
used in combination with the [`EncoderDecoderModel`] framework.
"""
def __init__(self, config):
super().__init__(config)
self.decoder = UnimerMBartDecoder(config)
def forward(self, *args, **kwargs):
return self.decoder(*args, **kwargs)
# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->MBart, facebook/bart-base->facebook/mbart-large-cc25
class UnimerMBartForCausalLM(UnimerMBartPreTrainedModel, GenerationMixin):
_tied_weights_keys = ["lm_head.weight"]
def __init__(self, config):
config = copy.deepcopy(config)
config.is_decoder = True
config.is_encoder_decoder = False
super().__init__(config)
self.model = UnimerMBartDecoderWrapper(config)
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.model.decoder.embed_tokens
def set_input_embeddings(self, value):
self.model.decoder.embed_tokens = value
def get_output_embeddings(self):
return self.lm_head
def set_output_embeddings(self, new_embeddings):
self.lm_head = new_embeddings
def set_decoder(self, decoder):
self.model.decoder = decoder
def get_decoder(self):
return self.model.decoder
@replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentionsAndCounting, config_class=_CONFIG_FOR_DOC)
def forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.FloatTensor] = None,
encoder_attention_mask: Optional[torch.FloatTensor] = None,
head_mask: Optional[torch.Tensor] = None,
cross_attn_head_mask: Optional[torch.Tensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
count_gt: Optional[torch.LongTensor] = None,
) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
provide it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
if the model is configured as a decoder.
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
tensors are only required when the model is used as a decoder in a Sequence to Sequence model.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
(see `past_key_values`).
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
returned tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
for more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Returns:
Example:
```python
>>> from transformers import AutoTokenizer, MBartForCausalLM
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")
>>> model = MBartForCausalLM.from_pretrained("facebook/mbart-large-cc25", add_cross_attention=False)
>>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
>>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
>>> outputs = model(**inputs)
>>> logits = outputs.logits
>>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
>>> list(logits.shape) == expected_shape
True
```"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
count_pred = None
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model.decoder(
input_ids=input_ids,
attention_mask=attention_mask,
count_pred=count_pred,
encoder_hidden_states=encoder_hidden_states,
encoder_attention_mask=encoder_attention_mask,
head_mask=head_mask,
cross_attn_head_mask=cross_attn_head_mask,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
logits = self.lm_head(outputs[0])
loss = None
if labels is not None:
labels = labels.to(logits.device)
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
if not return_dict:
output = (logits,) + outputs[1:]
return (loss,) + output if loss is not None else output
return CausalLMOutputWithCrossAttentionsAndCounting(
loss=loss,
logits=logits,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
cross_attentions=outputs.cross_attentions,
counting=count_pred,
)
def prepare_inputs_for_generation(
self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
):
# if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
if attention_mask is None:
attention_mask = input_ids.new_ones(input_ids.shape)
if past_key_values:
past_length = past_key_values[0][0].shape[2]
# Some generation methods already pass only the last input ID
if input_ids.shape[1] > past_length:
remove_prefix_length = past_length
else:
# Default to old behavior: keep only final ID
remove_prefix_length = input_ids.shape[1] - 1
input_ids = input_ids[:, remove_prefix_length:]
# first step, decoder_cached_states are empty
return {
"input_ids": input_ids, # encoder_outputs is defined. input_ids not needed
"attention_mask": attention_mask,
"past_key_values": past_key_values,
"use_cache": use_cache,
}
@staticmethod
def _reorder_cache(past_key_values, beam_idx):
reordered_past = ()
for layer_past in past_key_values:
reordered_past += (
tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
)
return reordered_past
from .configuration_unimer_swin import UnimerSwinConfig
from .modeling_unimer_swin import UnimerSwinModel
from .image_processing_unimer_swin import UnimerSwinImageProcessor
__all__ = [
"UnimerSwinConfig",
"UnimerSwinModel",
"UnimerSwinImageProcessor",
]
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment