refactor: rename init file and update app.py to enable parsing method

bd927919 · myhloli · f5016508 · f5016508 · f5016508 · f5016508
Commit bd927919 authored May 27, 2025 by myhloli
20 changed files
--- a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py
+++ b/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/configuration_layoutlmv3.py
-# coding=utf-8
-from transformers.models.bert.configuration_bert import BertConfig
-from transformers.utils import logging
-logger = logging.get_logger(__name__)
-LAYOUTLMV3_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    "layoutlmv3-base": "https://huggingface.co/microsoft/layoutlmv3-base/resolve/main/config.json",
-    "layoutlmv3-large": "https://huggingface.co/microsoft/layoutlmv3-large/resolve/main/config.json",
-    # See all LayoutLMv3 models at https://huggingface.co/models?filter=layoutlmv3
-}
-class LayoutLMv3Config(BertConfig):
-    model_type = "layoutlmv3"
-    def __init__(
-        self,
-        pad_token_id=1,
-        bos_token_id=0,
-        eos_token_id=2,
-        max_2d_position_embeddings=1024,
-        coordinate_size=None,
-        shape_size=None,
-        has_relative_attention_bias=False,
-        rel_pos_bins=32,
-        max_rel_pos=128,
-        has_spatial_attention_bias=False,
-        rel_2d_pos_bins=64,
-        max_rel_2d_pos=256,
-        visual_embed=True,
-        mim=False,
-        wpa_task=False,
-        discrete_vae_weight_path='',
-        discrete_vae_type='dall-e',
-        input_size=224,
-        second_input_size=112,
-        device='cuda',
-        **kwargs
-    ):
-        """Constructs RobertaConfig."""
-        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
-        self.max_2d_position_embeddings = max_2d_position_embeddings
-        self.coordinate_size = coordinate_size
-        self.shape_size = shape_size
-        self.has_relative_attention_bias = has_relative_attention_bias
-        self.rel_pos_bins = rel_pos_bins
-        self.max_rel_pos = max_rel_pos
-        self.has_spatial_attention_bias = has_spatial_attention_bias
-        self.rel_2d_pos_bins = rel_2d_pos_bins
-        self.max_rel_2d_pos = max_rel_2d_pos
-        self.visual_embed = visual_embed
-        self.mim = mim
-        self.wpa_task = wpa_task
-        self.discrete_vae_weight_path = discrete_vae_weight_path
-        self.discrete_vae_type = discrete_vae_type
-        self.input_size = input_size
-        self.second_input_size = second_input_size
-        self.device = device
--- a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py
+++ b/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/modeling_layoutlmv3.py
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch LayoutLMv3 model. """
-import math
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.utils.checkpoint
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from transformers import apply_chunking_to_forward
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPastAndCrossAttentions,
-    BaseModelOutputWithPoolingAndCrossAttentions,
-    MaskedLMOutput,
-    TokenClassifierOutput,
-    QuestionAnsweringModelOutput,
-    SequenceClassifierOutput,
-)
-from transformers.modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
-from transformers.models.roberta.modeling_roberta import (
-    RobertaIntermediate,
-    RobertaLMHead,
-    RobertaOutput,
-    RobertaSelfOutput,
-)
-from transformers.utils import logging
-from .configuration_layoutlmv3 import LayoutLMv3Config
-from timm.models.layers import to_2tuple
-logger = logging.get_logger(__name__)
-class PatchEmbed(nn.Module):
-    """ Image to Patch Embedding
-    """
-    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
-        super().__init__()
-        img_size = to_2tuple(img_size)
-        patch_size = to_2tuple(patch_size)
-        self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
-        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
-        # The following variables are used in detection mycheckpointer.py
-        self.num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
-        self.num_patches_w = self.patch_shape[0]
-        self.num_patches_h = self.patch_shape[1]
-    def forward(self, x, position_embedding=None):
-        x = self.proj(x)
-        if position_embedding is not None:
-            # interpolate the position embedding to the corresponding size
-            position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1).permute(0, 3, 1, 2)
-            Hp, Wp = x.shape[2], x.shape[3]
-            position_embedding = F.interpolate(position_embedding, size=(Hp, Wp), mode='bicubic')
-            x = x + position_embedding
-        x = x.flatten(2).transpose(1, 2)
-        return x
-class LayoutLMv3Embeddings(nn.Module):
-    """
-    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
-    """
-    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
-    def __init__(self, config):
-        super().__init__()
-        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
-        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
-        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
-        self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)))
-        # End copy
-        self.padding_idx = config.pad_token_id
-        self.position_embeddings = nn.Embedding(
-            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
-        )
-        self.x_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size)
-        self.y_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.coordinate_size)
-        self.h_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size)
-        self.w_position_embeddings = nn.Embedding(config.max_2d_position_embeddings, config.shape_size)
-    def _calc_spatial_position_embeddings(self, bbox):
-        try:
-            assert torch.all(0 <= bbox) and torch.all(bbox <= 1023)
-            left_position_embeddings = self.x_position_embeddings(bbox[:, :, 0])
-            upper_position_embeddings = self.y_position_embeddings(bbox[:, :, 1])
-            right_position_embeddings = self.x_position_embeddings(bbox[:, :, 2])
-            lower_position_embeddings = self.y_position_embeddings(bbox[:, :, 3])
-        except IndexError as e:
-            raise IndexError("The :obj:`bbox` coordinate values should be within 0-1000 range.") from e
-        h_position_embeddings = self.h_position_embeddings(torch.clip(bbox[:, :, 3] - bbox[:, :, 1], 0, 1023))
-        w_position_embeddings = self.w_position_embeddings(torch.clip(bbox[:, :, 2] - bbox[:, :, 0], 0, 1023))
-        # below is the difference between LayoutLMEmbeddingsV2 (torch.cat) and LayoutLMEmbeddingsV1 (add)
-        spatial_position_embeddings = torch.cat(
-            [
-                left_position_embeddings,
-                upper_position_embeddings,
-                right_position_embeddings,
-                lower_position_embeddings,
-                h_position_embeddings,
-                w_position_embeddings,
-            ],
-            dim=-1,
-        )
-        return spatial_position_embeddings
-    def create_position_ids_from_input_ids(self, input_ids, padding_idx, past_key_values_length=0):
-        """
-        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
-        are ignored. This is modified from fairseq's `utils.make_positions`.
-        Args:
-            x: torch.Tensor x:
-        Returns: torch.Tensor
-        """
-        # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
-        mask = input_ids.ne(padding_idx).int()
-        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
-        return incremental_indices.long() + padding_idx
-    def forward(
-        self,
-        input_ids=None,
-        bbox=None,
-        token_type_ids=None,
-        position_ids=None,
-        inputs_embeds=None,
-        past_key_values_length=0,
-    ):
-        if position_ids is None:
-            if input_ids is not None:
-                # Create the position ids from the input token ids. Any padded tokens remain padded.
-                position_ids = self.create_position_ids_from_input_ids(
-                    input_ids, self.padding_idx, past_key_values_length).to(input_ids.device)
-            else:
-                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
-        if input_ids is not None:
-            input_shape = input_ids.size()
-        else:
-            input_shape = inputs_embeds.size()[:-1]
-        if token_type_ids is None:
-            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
-        if inputs_embeds is None:
-            inputs_embeds = self.word_embeddings(input_ids)
-        token_type_embeddings = self.token_type_embeddings(token_type_ids)
-        embeddings = inputs_embeds + token_type_embeddings
-        position_embeddings = self.position_embeddings(position_ids)
-        embeddings += position_embeddings
-        spatial_position_embeddings = self._calc_spatial_position_embeddings(bbox)
-        embeddings = embeddings + spatial_position_embeddings
-        embeddings = self.LayerNorm(embeddings)
-        embeddings = self.dropout(embeddings)
-        return embeddings
-    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
-        """
-        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
-        Args:
-            inputs_embeds: torch.Tensor≈
-        Returns: torch.Tensor
-        """
-        input_shape = inputs_embeds.size()[:-1]
-        sequence_length = input_shape[1]
-        position_ids = torch.arange(
-            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
-        )
-        return position_ids.unsqueeze(0).expand(input_shape)
-class LayoutLMv3PreTrainedModel(PreTrainedModel):
-    """
-    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
-    models.
-    """
-    config_class = LayoutLMv3Config
-    base_model_prefix = "layoutlmv3"
-    # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
-    def _init_weights(self, module):
-        """Initialize the weights"""
-        if isinstance(module, nn.Linear):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-class LayoutLMv3SelfAttention(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
-            raise ValueError(
-                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
-                f"heads ({config.num_attention_heads})"
-            )
-        self.num_attention_heads = config.num_attention_heads
-        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
-        self.all_head_size = self.num_attention_heads * self.attention_head_size
-        self.query = nn.Linear(config.hidden_size, self.all_head_size)
-        self.key = nn.Linear(config.hidden_size, self.all_head_size)
-        self.value = nn.Linear(config.hidden_size, self.all_head_size)
-        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
-        self.has_relative_attention_bias = config.has_relative_attention_bias
-        self.has_spatial_attention_bias = config.has_spatial_attention_bias
-    def transpose_for_scores(self, x):
-        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
-        x = x.view(*new_x_shape)
-        return x.permute(0, 2, 1, 3)
-    def cogview_attn(self, attention_scores, alpha=32):
-        '''
-        https://arxiv.org/pdf/2105.13290.pdf
-        Section 2.4 Stabilization of training: Precision Bottleneck Relaxation (PB-Relax).
-        A replacement of the original nn.Softmax(dim=-1)(attention_scores)
-        Seems the new attention_probs will result in a slower speed and a little bias
-        Can use torch.allclose(standard_attention_probs, cogview_attention_probs, atol=1e-08) for comparison
-        The smaller atol (e.g., 1e-08), the better.
-        '''
-        scaled_attention_scores = attention_scores / alpha
-        max_value = scaled_attention_scores.amax(dim=(-1)).unsqueeze(-1)
-        # max_value = scaled_attention_scores.amax(dim=(-2, -1)).unsqueeze(-1).unsqueeze(-1)
-        new_attention_scores = (scaled_attention_scores - max_value) * alpha
-        return nn.Softmax(dim=-1)(new_attention_scores)
-    def forward(
-        self,
-        hidden_states,
-        attention_mask=None,
-        head_mask=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        past_key_value=None,
-        output_attentions=False,
-        rel_pos=None,
-        rel_2d_pos=None,
-    ):
-        mixed_query_layer = self.query(hidden_states)
-        # If this is instantiated as a cross-attention module, the keys
-        # and values come from an encoder; the attention mask needs to be
-        # such that the encoder's padding tokens are not attended to.
-        is_cross_attention = encoder_hidden_states is not None
-        if is_cross_attention and past_key_value is not None:
-            # reuse k,v, cross_attentions
-            key_layer = past_key_value[0]
-            value_layer = past_key_value[1]
-            attention_mask = encoder_attention_mask
-        elif is_cross_attention:
-            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
-            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
-            attention_mask = encoder_attention_mask
-        elif past_key_value is not None:
-            key_layer = self.transpose_for_scores(self.key(hidden_states))
-            value_layer = self.transpose_for_scores(self.value(hidden_states))
-            key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
-            value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
-        else:
-            key_layer = self.transpose_for_scores(self.key(hidden_states))
-            value_layer = self.transpose_for_scores(self.value(hidden_states))
-        query_layer = self.transpose_for_scores(mixed_query_layer)
-        # Take the dot product between "query" and "key" to get the raw attention scores.
-        # The attention scores QT K/√d could be significantly larger than input elements, and result in overflow.
-        # Changing the computational order into QT(K/√d) alleviates the problem. (https://arxiv.org/pdf/2105.13290.pdf)
-        attention_scores = torch.matmul(query_layer / math.sqrt(self.attention_head_size), key_layer.transpose(-1, -2))
-        if self.has_relative_attention_bias and self.has_spatial_attention_bias:
-            attention_scores += (rel_pos + rel_2d_pos) / math.sqrt(self.attention_head_size)
-        elif self.has_relative_attention_bias:
-            attention_scores += rel_pos / math.sqrt(self.attention_head_size)
-        # if self.has_relative_attention_bias:
-        #     attention_scores += rel_pos
-        # if self.has_spatial_attention_bias:
-        #     attention_scores += rel_2d_pos
-        # attention_scores = attention_scores / math.sqrt(self.attention_head_size)
-        if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function)
-            attention_scores = attention_scores + attention_mask
-        # Normalize the attention scores to probabilities.
-        # attention_probs = nn.Softmax(dim=-1)(attention_scores)  # comment the line below and use this line for speedup
-        attention_probs = self.cogview_attn(attention_scores)  # to stablize training
-        # assert torch.allclose(attention_probs, nn.Softmax(dim=-1)(attention_scores), atol=1e-8)
-        # This is actually dropping out entire tokens to attend to, which might
-        # seem a bit unusual, but is taken from the original Transformer paper.
-        attention_probs = self.dropout(attention_probs)
-        # Mask heads if we want to
-        if head_mask is not None:
-            attention_probs = attention_probs * head_mask
-        context_layer = torch.matmul(attention_probs, value_layer)
-        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
-        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
-        context_layer = context_layer.view(*new_context_layer_shape)
-        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
-        return outputs
-class LayoutLMv3Attention(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.self = LayoutLMv3SelfAttention(config)
-        self.output = RobertaSelfOutput(config)
-        self.pruned_heads = set()
-    def prune_heads(self, heads):
-        if len(heads) == 0:
-            return
-        heads, index = find_pruneable_heads_and_indices(
-            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
-        )
-        # Prune linear layers
-        self.self.query = prune_linear_layer(self.self.query, index)
-        self.self.key = prune_linear_layer(self.self.key, index)
-        self.self.value = prune_linear_layer(self.self.value, index)
-        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
-        # Update hyper params and store pruned heads
-        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
-        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
-        self.pruned_heads = self.pruned_heads.union(heads)
-    def forward(
-        self,
-        hidden_states,
-        attention_mask=None,
-        head_mask=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        past_key_value=None,
-        output_attentions=False,
-        rel_pos=None,
-        rel_2d_pos=None,
-    ):
-        self_outputs = self.self(
-            hidden_states,
-            attention_mask,
-            head_mask,
-            encoder_hidden_states,
-            encoder_attention_mask,
-            past_key_value,
-            output_attentions,
-            rel_pos=rel_pos,
-            rel_2d_pos=rel_2d_pos,
-        )
-        attention_output = self.output(self_outputs[0], hidden_states)
-        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
-        return outputs
-class LayoutLMv3Layer(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.chunk_size_feed_forward = config.chunk_size_feed_forward
-        self.seq_len_dim = 1
-        self.attention = LayoutLMv3Attention(config)
-        assert not config.is_decoder and not config.add_cross_attention, \
-            "This version do not support decoder. Please refer to RoBERTa for implementation of is_decoder."
-        self.intermediate = RobertaIntermediate(config)
-        self.output = RobertaOutput(config)
-    def forward(
-        self,
-        hidden_states,
-        attention_mask=None,
-        head_mask=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        past_key_value=None,
-        output_attentions=False,
-        rel_pos=None,
-        rel_2d_pos=None,
-    ):
-        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
-        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
-        self_attention_outputs = self.attention(
-            hidden_states,
-            attention_mask,
-            head_mask,
-            output_attentions=output_attentions,
-            past_key_value=self_attn_past_key_value,
-            rel_pos=rel_pos,
-            rel_2d_pos=rel_2d_pos,
-        )
-        attention_output = self_attention_outputs[0]
-        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
-        layer_output = apply_chunking_to_forward(
-            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
-        )
-        outputs = (layer_output,) + outputs
-        return outputs
-    def feed_forward_chunk(self, attention_output):
-        intermediate_output = self.intermediate(attention_output)
-        layer_output = self.output(intermediate_output, attention_output)
-        return layer_output
-class LayoutLMv3Encoder(nn.Module):
-    def __init__(self, config, detection=False, out_features=None):
-        super().__init__()
-        self.config = config
-        self.detection = detection
-        self.layer = nn.ModuleList([LayoutLMv3Layer(config) for _ in range(config.num_hidden_layers)])
-        self.gradient_checkpointing = False
-        self.has_relative_attention_bias = config.has_relative_attention_bias
-        self.has_spatial_attention_bias = config.has_spatial_attention_bias
-        if self.has_relative_attention_bias:
-            self.rel_pos_bins = config.rel_pos_bins
-            self.max_rel_pos = config.max_rel_pos
-            self.rel_pos_onehot_size = config.rel_pos_bins
-            self.rel_pos_bias = nn.Linear(self.rel_pos_onehot_size, config.num_attention_heads, bias=False)
-        if self.has_spatial_attention_bias:
-            self.max_rel_2d_pos = config.max_rel_2d_pos
-            self.rel_2d_pos_bins = config.rel_2d_pos_bins
-            self.rel_2d_pos_onehot_size = config.rel_2d_pos_bins
-            self.rel_pos_x_bias = nn.Linear(self.rel_2d_pos_onehot_size, config.num_attention_heads, bias=False)
-            self.rel_pos_y_bias = nn.Linear(self.rel_2d_pos_onehot_size, config.num_attention_heads, bias=False)
-        if self.detection:
-            self.gradient_checkpointing = True
-            embed_dim = self.config.hidden_size
-            self.out_features = out_features
-            self.out_indices = [int(name[5:]) for name in out_features]
-            self.fpn1 = nn.Sequential(
-                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
-                # nn.SyncBatchNorm(embed_dim),
-                nn.BatchNorm2d(embed_dim),
-                nn.GELU(),
-                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
-            )
-            self.fpn2 = nn.Sequential(
-                nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
-            )
-            self.fpn3 = nn.Identity()
-            self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)
-            self.ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
-    def relative_position_bucket(self, relative_position, bidirectional=True, num_buckets=32, max_distance=128):
-        ret = 0
-        if bidirectional:
-            num_buckets //= 2
-            ret += (relative_position > 0).long() * num_buckets
-            n = torch.abs(relative_position)
-        else:
-            n = torch.max(-relative_position, torch.zeros_like(relative_position))
-        # now n is in the range [0, inf)
-        # half of the buckets are for exact increments in positions
-        max_exact = num_buckets // 2
-        is_small = n < max_exact
-        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
-        val_if_large = max_exact + (
-                torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
-        ).to(torch.long)
-        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
-        ret += torch.where(is_small, n, val_if_large)
-        return ret
-    def _cal_1d_pos_emb(self, hidden_states, position_ids, valid_span):
-        VISUAL_NUM = 196 + 1
-        rel_pos_mat = position_ids.unsqueeze(-2) - position_ids.unsqueeze(-1)
-        if valid_span is not None:
-            # for the text part, if two words are not in the same line,
-            # set their distance to the max value (position_ids.shape[-1])
-            rel_pos_mat[(rel_pos_mat > 0) & (valid_span == False)] = position_ids.shape[1]
-            rel_pos_mat[(rel_pos_mat < 0) & (valid_span == False)] = -position_ids.shape[1]
-            # image-text, minimum distance
-            rel_pos_mat[:, -VISUAL_NUM:, :-VISUAL_NUM] = 0
-            rel_pos_mat[:, :-VISUAL_NUM, -VISUAL_NUM:] = 0
-        rel_pos = self.relative_position_bucket(
-            rel_pos_mat,
-            num_buckets=self.rel_pos_bins,
-            max_distance=self.max_rel_pos,
-        )
-        rel_pos = F.one_hot(rel_pos, num_classes=self.rel_pos_onehot_size).type_as(hidden_states)
-        rel_pos = self.rel_pos_bias(rel_pos).permute(0, 3, 1, 2)
-        rel_pos = rel_pos.contiguous()
-        return rel_pos
-    def _cal_2d_pos_emb(self, hidden_states, bbox):
-        position_coord_x = bbox[:, :, 0]
-        position_coord_y = bbox[:, :, 3]
-        rel_pos_x_2d_mat = position_coord_x.unsqueeze(-2) - position_coord_x.unsqueeze(-1)
-        rel_pos_y_2d_mat = position_coord_y.unsqueeze(-2) - position_coord_y.unsqueeze(-1)
-        rel_pos_x = self.relative_position_bucket(
-            rel_pos_x_2d_mat,
-            num_buckets=self.rel_2d_pos_bins,
-            max_distance=self.max_rel_2d_pos,
-        )
-        rel_pos_y = self.relative_position_bucket(
-            rel_pos_y_2d_mat,
-            num_buckets=self.rel_2d_pos_bins,
-            max_distance=self.max_rel_2d_pos,
-        )
-        rel_pos_x = F.one_hot(rel_pos_x, num_classes=self.rel_2d_pos_onehot_size).type_as(hidden_states)
-        rel_pos_y = F.one_hot(rel_pos_y, num_classes=self.rel_2d_pos_onehot_size).type_as(hidden_states)
-        rel_pos_x = self.rel_pos_x_bias(rel_pos_x).permute(0, 3, 1, 2)
-        rel_pos_y = self.rel_pos_y_bias(rel_pos_y).permute(0, 3, 1, 2)
-        rel_pos_x = rel_pos_x.contiguous()
-        rel_pos_y = rel_pos_y.contiguous()
-        rel_2d_pos = rel_pos_x + rel_pos_y
-        return rel_2d_pos
-    def forward(
-        self,
-        hidden_states,
-        bbox=None,
-        attention_mask=None,
-        head_mask=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        past_key_values=None,
-        use_cache=None,
-        output_attentions=False,
-        output_hidden_states=False,
-        return_dict=True,
-        position_ids=None,
-        Hp=None,
-        Wp=None,
-        valid_span=None,
-    ):
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attentions = () if output_attentions else None
-        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
-        next_decoder_cache = () if use_cache else None
-        rel_pos = self._cal_1d_pos_emb(hidden_states, position_ids, valid_span) if self.has_relative_attention_bias else None
-        rel_2d_pos = self._cal_2d_pos_emb(hidden_states, bbox) if self.has_spatial_attention_bias else None
-        if self.detection:
-            feat_out = {}
-            j = 0
-        for i, layer_module in enumerate(self.layer):
-            if output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-            layer_head_mask = head_mask[i] if head_mask is not None else None
-            past_key_value = past_key_values[i] if past_key_values is not None else None
-            if self.gradient_checkpointing and self.training:
-                if use_cache:
-                    logger.warning(
-                        "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
-                    )
-                    use_cache = False
-                def create_custom_forward(module):
-                    def custom_forward(*inputs):
-                        return module(*inputs)
-                        # return module(*inputs, past_key_value, output_attentions, rel_pos, rel_2d_pos)
-                        # The above line will cause error:
-                        # RuntimeError: Trying to backward through the graph a second time
-                        # (or directly access saved tensors after they have already been freed).
-                    return custom_forward
-                layer_outputs = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(layer_module),
-                    hidden_states,
-                    attention_mask,
-                    layer_head_mask,
-                    encoder_hidden_states,
-                    encoder_attention_mask,
-                    past_key_value,
-                    output_attentions,
-                    rel_pos,
-                    rel_2d_pos
-                )
-            else:
-                layer_outputs = layer_module(
-                    hidden_states,
-                    attention_mask,
-                    layer_head_mask,
-                    encoder_hidden_states,
-                    encoder_attention_mask,
-                    past_key_value,
-                    output_attentions,
-                    rel_pos=rel_pos,
-                    rel_2d_pos=rel_2d_pos,
-                )
-            hidden_states = layer_outputs[0]
-            if use_cache:
-                next_decoder_cache += (layer_outputs[-1],)
-            if output_attentions:
-                all_self_attentions = all_self_attentions + (layer_outputs[1],)
-                if self.config.add_cross_attention:
-                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
-            if self.detection and i in self.out_indices:
-                xp = hidden_states[:, -Hp*Wp:, :].permute(0, 2, 1).reshape(len(hidden_states), -1, Hp, Wp)
-                feat_out[self.out_features[j]] = self.ops[j](xp.contiguous())
-                j += 1
-        if self.detection:
-            return feat_out
-        if output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-        if not return_dict:
-            return tuple(
-                v
-                for v in [
-                    hidden_states,
-                    next_decoder_cache,
-                    all_hidden_states,
-                    all_self_attentions,
-                    all_cross_attentions,
-                ]
-                if v is not None
-            )
-        return BaseModelOutputWithPastAndCrossAttentions(
-            last_hidden_state=hidden_states,
-            past_key_values=next_decoder_cache,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attentions,
-            cross_attentions=all_cross_attentions,
-        )
-class LayoutLMv3Model(LayoutLMv3PreTrainedModel):
-    """
-    """
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-    # Copied from transformers.models.bert.modeling_bert.BertModel.__init__ with Bert->Roberta
-    def __init__(self, config, detection=False, out_features=None, image_only=False):
-        super().__init__(config)
-        self.config = config
-        assert not config.is_decoder and not config.add_cross_attention, \
-            "This version do not support decoder. Please refer to RoBERTa for implementation of is_decoder."
-        self.detection = detection
-        if not self.detection:
-            self.image_only = False
-        else:
-            assert config.visual_embed
-            self.image_only = image_only
-        if not self.image_only:
-            self.embeddings = LayoutLMv3Embeddings(config)
-        self.encoder = LayoutLMv3Encoder(config, detection=detection, out_features=out_features)
-        if config.visual_embed:
-            embed_dim = self.config.hidden_size
-            # use the default pre-training parameters for fine-tuning (e.g., input_size)
-            # when the input_size is larger in fine-tuning, we will interpolate the position embedding in forward
-            self.patch_embed = PatchEmbed(embed_dim=embed_dim)
-            patch_size = 16
-            size = int(self.config.input_size / patch_size)
-            self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
-            self.pos_embed = nn.Parameter(torch.zeros(1, size * size + 1, embed_dim))
-            self.pos_drop = nn.Dropout(p=0.)
-            self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
-            self.dropout = nn.Dropout(config.hidden_dropout_prob)
-            if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
-                self._init_visual_bbox(img_size=(size, size))
-            from functools import partial
-            norm_layer = partial(nn.LayerNorm, eps=1e-6)
-            self.norm = norm_layer(embed_dim)
-        self.init_weights()
-    def get_input_embeddings(self):
-        return self.embeddings.word_embeddings
-    def set_input_embeddings(self, value):
-        self.embeddings.word_embeddings = value
-    def _prune_heads(self, heads_to_prune):
-        """
-        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
-        class PreTrainedModel
-        """
-        for layer, heads in heads_to_prune.items():
-            self.encoder.layer[layer].attention.prune_heads(heads)
-    def _init_visual_bbox(self, img_size=(14, 14), max_len=1000):
-        visual_bbox_x = torch.div(torch.arange(0, max_len * (img_size[1] + 1), max_len),
-                                  img_size[1], rounding_mode='trunc')
-        visual_bbox_y = torch.div(torch.arange(0, max_len * (img_size[0] + 1), max_len),
-                                  img_size[0], rounding_mode='trunc')
-        visual_bbox = torch.stack(
-            [
-                visual_bbox_x[:-1].repeat(img_size[0], 1),
-                visual_bbox_y[:-1].repeat(img_size[1], 1).transpose(0, 1),
-                visual_bbox_x[1:].repeat(img_size[0], 1),
-                visual_bbox_y[1:].repeat(img_size[1], 1).transpose(0, 1),
-            ],
-            dim=-1,
-        ).view(-1, 4)
-        cls_token_box = torch.tensor([[0 + 1, 0 + 1, max_len - 1, max_len - 1]])
-        self.visual_bbox = torch.cat([cls_token_box, visual_bbox], dim=0)
-    def _calc_visual_bbox(self, device, dtype, bsz):  # , img_size=(14, 14), max_len=1000):
-        visual_bbox = self.visual_bbox.repeat(bsz, 1, 1)
-        visual_bbox = visual_bbox.to(device).type(dtype)
-        return visual_bbox
-    def forward_image(self, x):
-        if self.detection:
-            x = self.patch_embed(x, self.pos_embed[:, 1:, :] if self.pos_embed is not None else None)
-        else:
-            x = self.patch_embed(x)
-        batch_size, seq_len, _ = x.size()
-        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
-        if self.pos_embed is not None and self.detection:
-            cls_tokens = cls_tokens + self.pos_embed[:, :1, :]
-        x = torch.cat((cls_tokens, x), dim=1)
-        if self.pos_embed is not None and not self.detection:
-            x = x + self.pos_embed
-        x = self.pos_drop(x)
-        x = self.norm(x)
-        return x
-    # Copied from transformers.models.bert.modeling_bert.BertModel.forward
-    def forward(
-        self,
-        input_ids=None,
-        bbox=None,
-        attention_mask=None,
-        token_type_ids=None,
-        valid_span=None,
-        position_ids=None,
-        head_mask=None,
-        inputs_embeds=None,
-        encoder_hidden_states=None,
-        encoder_attention_mask=None,
-        past_key_values=None,
-        use_cache=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-        images=None,
-    ):
-        r"""
-        encoder_hidden_states  (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
-            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
-            the model is configured as a decoder.
-        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
-            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
-            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
-            If :obj:`past_key_values` are used, the user can optionally input only the last :obj:`decoder_input_ids`
-            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
-            instead of all :obj:`decoder_input_ids` of shape :obj:`(batch_size, sequence_length)`.
-        use_cache (:obj:`bool`, `optional`):
-            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
-            decoding (see :obj:`past_key_values`).
-        """
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        use_cache = False
-        # if input_ids is not None and inputs_embeds is not None:
-        #     raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        if input_ids is not None:
-            input_shape = input_ids.size()
-            batch_size, seq_length = input_shape
-            device = input_ids.device
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-            batch_size, seq_length = input_shape
-            device = inputs_embeds.device
-        elif images is not None:
-            batch_size = len(images)
-            device = images.device
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds or images")
-        if not self.image_only:
-            # past_key_values_length
-            past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
-            if attention_mask is None:
-                attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
-            if token_type_ids is None:
-                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
-        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
-        # ourselves in which case we just need to make it broadcastable to all heads.
-        # extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
-        encoder_extended_attention_mask = None
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
-        if not self.image_only:
-            if bbox is None:
-                bbox = torch.zeros(tuple(list(input_shape) + [4]), dtype=torch.long, device=device)
-            embedding_output = self.embeddings(
-                input_ids=input_ids,
-                bbox=bbox,
-                position_ids=position_ids,
-                token_type_ids=token_type_ids,
-                inputs_embeds=inputs_embeds,
-                past_key_values_length=past_key_values_length,
-            )
-        final_bbox = final_position_ids = None
-        Hp = Wp = None
-        if images is not None:
-            patch_size = 16
-            Hp, Wp = int(images.shape[2] / patch_size), int(images.shape[3] / patch_size)
-            visual_emb = self.forward_image(images)
-            if self.detection:
-                visual_attention_mask = torch.ones((batch_size, visual_emb.shape[1]), dtype=torch.long, device=device)
-                if self.image_only:
-                    attention_mask = visual_attention_mask
-                else:
-                    attention_mask = torch.cat([attention_mask, visual_attention_mask], dim=1)
-            elif self.image_only:
-                attention_mask = torch.ones((batch_size, visual_emb.shape[1]), dtype=torch.long, device=device)
-            if self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
-                if self.config.has_spatial_attention_bias:
-                    visual_bbox = self._calc_visual_bbox(device, dtype=torch.long, bsz=batch_size)
-                    if self.image_only:
-                        final_bbox = visual_bbox
-                    else:
-                        final_bbox = torch.cat([bbox, visual_bbox], dim=1)
-                visual_position_ids = torch.arange(0, visual_emb.shape[1], dtype=torch.long, device=device).repeat(
-                    batch_size, 1)
-                if self.image_only:
-                    final_position_ids = visual_position_ids
-                else:
-                    position_ids = torch.arange(0, input_shape[1], device=device).unsqueeze(0)
-                    position_ids = position_ids.expand_as(input_ids)
-                    final_position_ids = torch.cat([position_ids, visual_position_ids], dim=1)
-            if self.image_only:
-                embedding_output = visual_emb
-            else:
-                embedding_output = torch.cat([embedding_output, visual_emb], dim=1)
-            embedding_output = self.LayerNorm(embedding_output)
-            embedding_output = self.dropout(embedding_output)
-        elif self.config.has_relative_attention_bias or self.config.has_spatial_attention_bias:
-            if self.config.has_spatial_attention_bias:
-                final_bbox = bbox
-            if self.config.has_relative_attention_bias:
-                position_ids = self.embeddings.position_ids[:, :input_shape[1]]
-                position_ids = position_ids.expand_as(input_ids)
-                final_position_ids = position_ids
-        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, None, device)
-        encoder_outputs = self.encoder(
-            embedding_output,
-            bbox=final_bbox,
-            position_ids=final_position_ids,
-            attention_mask=extended_attention_mask,
-            head_mask=head_mask,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_extended_attention_mask,
-            past_key_values=past_key_values,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            Hp=Hp,
-            Wp=Wp,
-            valid_span=valid_span,
-        )
-        if self.detection:
-            return encoder_outputs
-        sequence_output = encoder_outputs[0]
-        pooled_output = None
-        if not return_dict:
-            return (sequence_output, pooled_output) + encoder_outputs[1:]
-        return BaseModelOutputWithPoolingAndCrossAttentions(
-            last_hidden_state=sequence_output,
-            pooler_output=pooled_output,
-            past_key_values=encoder_outputs.past_key_values,
-            hidden_states=encoder_outputs.hidden_states,
-            attentions=encoder_outputs.attentions,
-            cross_attentions=encoder_outputs.cross_attentions,
-        )
-class LayoutLMv3ClassificationHead(nn.Module):
-    """
-    Head for sentence-level classification tasks.
-    Reference: RobertaClassificationHead
-    """
-    def __init__(self, config, pool_feature=False):
-        super().__init__()
-        self.pool_feature = pool_feature
-        if pool_feature:
-            self.dense = nn.Linear(config.hidden_size*3, config.hidden_size)
-        else:
-            self.dense = nn.Linear(config.hidden_size, config.hidden_size)
-        classifier_dropout = (
-            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
-        )
-        self.dropout = nn.Dropout(classifier_dropout)
-        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
-    def forward(self, x):
-        # x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
-        x = self.dropout(x)
-        x = self.dense(x)
-        x = torch.tanh(x)
-        x = self.dropout(x)
-        x = self.out_proj(x)
-        return x
-class LayoutLMv3ForTokenClassification(LayoutLMv3PreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-        self.layoutlmv3 = LayoutLMv3Model(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        if config.num_labels < 10:
-            self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-        else:
-            self.classifier = LayoutLMv3ClassificationHead(config, pool_feature=False)
-        self.init_weights()
-    def forward(
-        self,
-        input_ids=None,
-        bbox=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        valid_span=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-        images=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
-            Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
-            1]``.
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        outputs = self.layoutlmv3(
-            input_ids,
-            bbox=bbox,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            images=images,
-            valid_span=valid_span,
-        )
-        sequence_output = outputs[0]
-        sequence_output = self.dropout(sequence_output)
-        logits = self.classifier(sequence_output)
-        loss = None
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            # Only keep active parts of the loss
-            if attention_mask is not None:
-                active_loss = attention_mask.view(-1) == 1
-                active_logits = logits.view(-1, self.num_labels)
-                active_labels = torch.where(
-                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
-                )
-                loss = loss_fct(active_logits, active_labels)
-            else:
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-        if not return_dict:
-            output = (logits,) + outputs[2:]
-            return ((loss,) + output) if loss is not None else output
-        return TokenClassifierOutput(
-            loss=loss,
-            logits=logits,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
-class LayoutLMv3ForQuestionAnswering(LayoutLMv3PreTrainedModel):
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-        self.layoutlmv3 = LayoutLMv3Model(config)
-        # self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
-        self.qa_outputs = LayoutLMv3ClassificationHead(config, pool_feature=False)
-        self.init_weights()
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        valid_span=None,
-        head_mask=None,
-        inputs_embeds=None,
-        start_positions=None,
-        end_positions=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-        bbox=None,
-        images=None,
-    ):
-        r"""
-        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
-            sequence are not taken into account for computing the loss.
-        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
-            sequence are not taken into account for computing the loss.
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        outputs = self.layoutlmv3(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            bbox=bbox,
-            images=images,
-            valid_span=valid_span,
-        )
-        sequence_output = outputs[0]
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1).contiguous()
-        end_logits = end_logits.squeeze(-1).contiguous()
-        total_loss = None
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions = start_positions.clamp(0, ignored_index)
-            end_positions = end_positions.clamp(0, ignored_index)
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-        if not return_dict:
-            output = (start_logits, end_logits) + outputs[2:]
-            return ((total_loss,) + output) if total_loss is not None else output
-        return QuestionAnsweringModelOutput(
-            loss=total_loss,
-            start_logits=start_logits,
-            end_logits=end_logits,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
-class LayoutLMv3ForSequenceClassification(LayoutLMv3PreTrainedModel):
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-    def __init__(self, config):
-        super().__init__(config)
-        self.num_labels = config.num_labels
-        self.config = config
-        self.layoutlmv3 = LayoutLMv3Model(config)
-        self.classifier = LayoutLMv3ClassificationHead(config, pool_feature=False)
-        self.init_weights()
-    def forward(
-        self,
-        input_ids=None,
-        attention_mask=None,
-        token_type_ids=None,
-        position_ids=None,
-        valid_span=None,
-        head_mask=None,
-        inputs_embeds=None,
-        labels=None,
-        output_attentions=None,
-        output_hidden_states=None,
-        return_dict=None,
-        bbox=None,
-        images=None,
-    ):
-        r"""
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
-            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
-            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
-            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        outputs = self.layoutlmv3(
-            input_ids,
-            attention_mask=attention_mask,
-            token_type_ids=token_type_ids,
-            position_ids=position_ids,
-            head_mask=head_mask,
-            inputs_embeds=inputs_embeds,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-            bbox=bbox,
-            images=images,
-            valid_span=valid_span,
-        )
-        sequence_output = outputs[0][:, 0, :]
-        logits = self.classifier(sequence_output)
-        loss = None
-        if labels is not None:
-            if self.config.problem_type is None:
-                if self.num_labels == 1:
-                    self.config.problem_type = "regression"
-                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
-                    self.config.problem_type = "single_label_classification"
-                else:
-                    self.config.problem_type = "multi_label_classification"
-            if self.config.problem_type == "regression":
-                loss_fct = MSELoss()
-                if self.num_labels == 1:
-                    loss = loss_fct(logits.squeeze(), labels.squeeze())
-                else:
-                    loss = loss_fct(logits, labels)
-            elif self.config.problem_type == "single_label_classification":
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            elif self.config.problem_type == "multi_label_classification":
-                loss_fct = BCEWithLogitsLoss()
-                loss = loss_fct(logits, labels)
-        if not return_dict:
-            output = (logits,) + outputs[2:]
-            return ((loss,) + output) if loss is not None else output
-        return SequenceClassifierOutput(
-            loss=loss,
-            logits=logits,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-        )
--- a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py
+++ b/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3.py
-# coding=utf-8
-# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for LayoutLMv3, refer to RoBERTa."""
-from transformers.models.roberta import RobertaTokenizer
-from transformers.utils import logging
-logger = logging.get_logger(__name__)
-VOCAB_FILES_NAMES = {
-    "vocab_file": "vocab.json",
-    "merges_file": "merges.txt",
-}
-class LayoutLMv3Tokenizer(RobertaTokenizer):
-    vocab_files_names = VOCAB_FILES_NAMES
-    # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    model_input_names = ["input_ids", "attention_mask"]
--- a/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py
+++ b/magic_pdf/model/sub_modules/layout/layoutlmv3/layoutlmft/models/layoutlmv3/tokenization_layoutlmv3_fast.py
-# coding=utf-8
-# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Fast Tokenization classes for LayoutLMv3, refer to RoBERTa."""
-from transformers.models.roberta.tokenization_roberta_fast import RobertaTokenizerFast
-from transformers.utils import logging
-from .tokenization_layoutlmv3 import LayoutLMv3Tokenizer
-logger = logging.get_logger(__name__)
-VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
-class LayoutLMv3TokenizerFast(RobertaTokenizerFast):
-    vocab_files_names = VOCAB_FILES_NAMES
-    # pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    # max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
-    model_input_names = ["input_ids", "attention_mask"]
-    slow_tokenizer_class = LayoutLMv3Tokenizer
--- a/magic_pdf/model/sub_modules/layout/layoutlmv3/model_init.py
+++ b/magic_pdf/model/sub_modules/layout/layoutlmv3/model_init.py
-from .visualizer import Visualizer
-from .rcnn_vl import *
-from .backbone import *
-from detectron2.config import get_cfg
-from detectron2.config import CfgNode as CN
-from detectron2.data import MetadataCatalog, DatasetCatalog
-from detectron2.data.datasets import register_coco_instances
-from detectron2.engine import DefaultTrainer, default_argument_parser, default_setup, launch, DefaultPredictor
-def add_vit_config(cfg):
-    """
-    Add config for VIT.
-    """
-    _C = cfg
-    _C.MODEL.VIT = CN()
-    # CoaT model name.
-    _C.MODEL.VIT.NAME = ""
-    # Output features from CoaT backbone.
-    _C.MODEL.VIT.OUT_FEATURES = ["layer3", "layer5", "layer7", "layer11"]
-    _C.MODEL.VIT.IMG_SIZE = [224, 224]
-    _C.MODEL.VIT.POS_TYPE = "shared_rel"
-    _C.MODEL.VIT.DROP_PATH = 0.
-    _C.MODEL.VIT.MODEL_KWARGS = "{}"
-    _C.SOLVER.OPTIMIZER = "ADAMW"
-    _C.SOLVER.BACKBONE_MULTIPLIER = 1.0
-    _C.AUG = CN()
-    _C.AUG.DETR = False
-    _C.MODEL.IMAGE_ONLY = True
-    _C.PUBLAYNET_DATA_DIR_TRAIN = ""
-    _C.PUBLAYNET_DATA_DIR_TEST = ""
-    _C.FOOTNOTE_DATA_DIR_TRAIN = ""
-    _C.FOOTNOTE_DATA_DIR_VAL = ""
-    _C.SCIHUB_DATA_DIR_TRAIN = ""
-    _C.SCIHUB_DATA_DIR_TEST = ""
-    _C.JIAOCAI_DATA_DIR_TRAIN = ""
-    _C.JIAOCAI_DATA_DIR_TEST = ""
-    _C.ICDAR_DATA_DIR_TRAIN = ""
-    _C.ICDAR_DATA_DIR_TEST = ""
-    _C.M6DOC_DATA_DIR_TEST = ""
-    _C.DOCSTRUCTBENCH_DATA_DIR_TEST = ""
-    _C.DOCSTRUCTBENCHv2_DATA_DIR_TEST = ""
-    _C.CACHE_DIR = ""
-    _C.MODEL.CONFIG_PATH = ""
-    # effective update steps would be MAX_ITER/GRADIENT_ACCUMULATION_STEPS
-    # maybe need to set MAX_ITER *= GRADIENT_ACCUMULATION_STEPS
-    _C.SOLVER.GRADIENT_ACCUMULATION_STEPS = 1
-def setup(args, device):
-    """
-    Create configs and perform basic setups.
-    """
-    cfg = get_cfg()
-    # add_coat_config(cfg)
-    add_vit_config(cfg)
-    cfg.merge_from_file(args.config_file)
-    cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.2  # set threshold for this model
-    cfg.merge_from_list(args.opts)
-    # 使用统一的device配置
-    cfg.MODEL.DEVICE = device
-    cfg.freeze()
-    default_setup(cfg, args)
-    #@todo 可以删掉这块？
-    # register_coco_instances(
-    #     "scihub_train",
-    #     {},
-    #     cfg.SCIHUB_DATA_DIR_TRAIN + ".json",
-    #     cfg.SCIHUB_DATA_DIR_TRAIN
-    # )
-    return cfg
-class DotDict(dict):
-    def __init__(self, *args, **kwargs):
-        super(DotDict, self).__init__(*args, **kwargs)
-    def __getattr__(self, key):
-        if key not in self.keys():
-            return None
-        value = self[key]
-        if isinstance(value, dict):
-            value = DotDict(value)
-        return value
-    def __setattr__(self, key, value):
-        self[key] = value
-class Layoutlmv3_Predictor(object):
-    def __init__(self, weights, config_file, device):
-        layout_args = {
-            "config_file": config_file,
-            "resume": False,
-            "eval_only": False,
-            "num_gpus": 1,
-            "num_machines": 1,
-            "machine_rank": 0,
-            "dist_url": "tcp://127.0.0.1:57823",
-            "opts": ["MODEL.WEIGHTS", weights],
-        }
-        layout_args = DotDict(layout_args)
-        cfg = setup(layout_args, device)
-        self.mapping = ["title", "plain text", "abandon", "figure", "figure_caption", "table", "table_caption",
-                        "table_footnote", "isolate_formula", "formula_caption"]
-        MetadataCatalog.get(cfg.DATASETS.TRAIN[0]).thing_classes = self.mapping
-        self.predictor = DefaultPredictor(cfg)
-    def __call__(self, image, ignore_catids=[]):
-        # page_layout_result = {
-        #     "layout_dets": []
-        # }
-        layout_dets = []
-        outputs = self.predictor(image)
-        boxes = outputs["instances"].to("cpu")._fields["pred_boxes"].tensor.tolist()
-        labels = outputs["instances"].to("cpu")._fields["pred_classes"].tolist()
-        scores = outputs["instances"].to("cpu")._fields["scores"].tolist()
-        for bbox_idx in range(len(boxes)):
-            if labels[bbox_idx] in ignore_catids:
-                continue
-            layout_dets.append({
-                "category_id": labels[bbox_idx],
-                "poly": [
-                    boxes[bbox_idx][0], boxes[bbox_idx][1],
-                    boxes[bbox_idx][2], boxes[bbox_idx][1],
-                    boxes[bbox_idx][2], boxes[bbox_idx][3],
-                    boxes[bbox_idx][0], boxes[bbox_idx][3],
-                ],
-                "score": scores[bbox_idx]
-            })
-        return layout_dets
--- a/magic_pdf/model/sub_modules/layout/layoutlmv3/rcnn_vl.py
+++ b/magic_pdf/model/sub_modules/layout/layoutlmv3/rcnn_vl.py
-# Copyright (c) Facebook, Inc. and its affiliates.
-import logging
-import numpy as np
-from typing import Dict, List, Optional, Tuple
-import torch
-from torch import nn
-from detectron2.config import configurable
-from detectron2.structures import ImageList, Instances
-from detectron2.utils.events import get_event_storage
-from detectron2.modeling.backbone import Backbone, build_backbone
-from detectron2.modeling.meta_arch.build import META_ARCH_REGISTRY
-from detectron2.modeling.meta_arch import GeneralizedRCNN
-from detectron2.modeling.postprocessing import detector_postprocess
-from detectron2.modeling.roi_heads.fast_rcnn import fast_rcnn_inference_single_image
-from contextlib import contextmanager
-from itertools import count
-@META_ARCH_REGISTRY.register()
-class VLGeneralizedRCNN(GeneralizedRCNN):
-    """
-    Generalized R-CNN. Any models that contains the following three components:
-    1. Per-image feature extraction (aka backbone)
-    2. Region proposal generation
-    3. Per-region feature extraction and prediction
-    """
-    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
-        """
-        Args:
-            batched_inputs: a list, batched outputs of :class:`DatasetMapper` .
-                Each item in the list contains the inputs for one image.
-                For now, each item in the list is a dict that contains:
-                * image: Tensor, image in (C, H, W) format.
-                * instances (optional): groundtruth :class:`Instances`
-                * proposals (optional): :class:`Instances`, precomputed proposals.
-                Other information that's included in the original dicts, such as:
-                * "height", "width" (int): the output resolution of the model, used in inference.
-                  See :meth:`postprocess` for details.
-        Returns:
-            list[dict]:
-                Each dict is the output for one input image.
-                The dict contains one key "instances" whose value is a :class:`Instances`.
-                The :class:`Instances` object has the following keys:
-                "pred_boxes", "pred_classes", "scores", "pred_masks", "pred_keypoints"
-        """
-        if not self.training:
-            return self.inference(batched_inputs)
-        images = self.preprocess_image(batched_inputs)
-        if "instances" in batched_inputs[0]:
-            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
-        else:
-            gt_instances = None
-        # features = self.backbone(images.tensor)
-        input = self.get_batch(batched_inputs, images)
-        features = self.backbone(input)
-        if self.proposal_generator is not None:
-            proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)
-        else:
-            assert "proposals" in batched_inputs[0]
-            proposals = [x["proposals"].to(self.device) for x in batched_inputs]
-            proposal_losses = {}
-        _, detector_losses = self.roi_heads(images, features, proposals, gt_instances)
-        if self.vis_period > 0:
-            storage = get_event_storage()
-            if storage.iter % self.vis_period == 0:
-                self.visualize_training(batched_inputs, proposals)
-        losses = {}
-        losses.update(detector_losses)
-        losses.update(proposal_losses)
-        return losses
-    def inference(
-        self,
-        batched_inputs: List[Dict[str, torch.Tensor]],
-        detected_instances: Optional[List[Instances]] = None,
-        do_postprocess: bool = True,
-    ):
-        """
-        Run inference on the given inputs.
-        Args:
-            batched_inputs (list[dict]): same as in :meth:`forward`
-            detected_instances (None or list[Instances]): if not None, it
-                contains an `Instances` object per image. The `Instances`
-                object contains "pred_boxes" and "pred_classes" which are
-                known boxes in the image.
-                The inference will then skip the detection of bounding boxes,
-                and only predict other per-ROI outputs.
-            do_postprocess (bool): whether to apply post-processing on the outputs.
-        Returns:
-            When do_postprocess=True, same as in :meth:`forward`.
-            Otherwise, a list[Instances] containing raw network outputs.
-        """
-        assert not self.training
-        images = self.preprocess_image(batched_inputs)
-        # features = self.backbone(images.tensor)
-        input = self.get_batch(batched_inputs, images)
-        features = self.backbone(input)
-        if detected_instances is None:
-            if self.proposal_generator is not None:
-                proposals, _ = self.proposal_generator(images, features, None)
-            else:
-                assert "proposals" in batched_inputs[0]
-                proposals = [x["proposals"].to(self.device) for x in batched_inputs]
-            results, _ = self.roi_heads(images, features, proposals, None)
-        else:
-            detected_instances = [x.to(self.device) for x in detected_instances]
-            results = self.roi_heads.forward_with_given_boxes(features, detected_instances)
-        if do_postprocess:
-            assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
-            return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
-        else:
-            return results
-    def get_batch(self, examples, images):
-        if len(examples) >= 1 and "bbox" not in examples[0]:  # image_only
-            return {"images": images.tensor}
-        return input
-    def _batch_inference(self, batched_inputs, detected_instances=None):
-        """
-        Execute inference on a list of inputs,
-        using batch size = self.batch_size (e.g., 2), instead of the length of the list.
-        Inputs & outputs have the same format as :meth:`GeneralizedRCNN.inference`
-        """
-        if detected_instances is None:
-            detected_instances = [None] * len(batched_inputs)
-        outputs = []
-        inputs, instances = [], []
-        for idx, input, instance in zip(count(), batched_inputs, detected_instances):
-            inputs.append(input)
-            instances.append(instance)
-            if len(inputs) == 2 or idx == len(batched_inputs) - 1:
-                outputs.extend(
-                    self.inference(
-                        inputs,
-                        instances if instances[0] is not None else None,
-                        do_postprocess=True,  # False
-                    )
-                )
-                inputs, instances = [], []
-        return outputs
--- a/magic_pdf/model/sub_modules/layout/layoutlmv3/visualizer.py
+++ b/magic_pdf/model/sub_modules/layout/layoutlmv3/visualizer.py
-# Copyright (c) Facebook, Inc. and its affiliates.
-import colorsys
-import logging
-import math
-import numpy as np
-from enum import Enum, unique
-import cv2
-import matplotlib as mpl
-import matplotlib.colors as mplc
-import matplotlib.figure as mplfigure
-import pycocotools.mask as mask_util
-import torch
-from matplotlib.backends.backend_agg import FigureCanvasAgg
-from PIL import Image
-from detectron2.data import MetadataCatalog
-from detectron2.structures import BitMasks, Boxes, BoxMode, Keypoints, PolygonMasks, RotatedBoxes
-from detectron2.utils.file_io import PathManager
-from detectron2.utils.colormap import random_color
-import pdb
-logger = logging.getLogger(__name__)
-__all__ = ["ColorMode", "VisImage", "Visualizer"]
-_SMALL_OBJECT_AREA_THRESH = 1000
-_LARGE_MASK_AREA_THRESH = 120000
-_OFF_WHITE = (1.0, 1.0, 240.0 / 255)
-_BLACK = (0, 0, 0)
-_RED = (1.0, 0, 0)
-_KEYPOINT_THRESHOLD = 0.05
-#CLASS_NAMES = ["footnote", "footer", "header"]
-@unique
-class ColorMode(Enum):
-    """
-    Enum of different color modes to use for instance visualizations.
-    """
-    IMAGE = 0
-    """
-    Picks a random color for every instance and overlay segmentations with low opacity.
-    """
-    SEGMENTATION = 1
-    """
-    Let instances of the same category have similar colors
-    (from metadata.thing_colors), and overlay them with
-    high opacity. This provides more attention on the quality of segmentation.
-    """
-    IMAGE_BW = 2
-    """
-    Same as IMAGE, but convert all areas without masks to gray-scale.
-    Only available for drawing per-instance mask predictions.
-    """
-class GenericMask:
-    """
-    Attribute:
-        polygons (list[ndarray]): list[ndarray]: polygons for this mask.
-            Each ndarray has format [x, y, x, y, ...]
-        mask (ndarray): a binary mask
-    """
-    def __init__(self, mask_or_polygons, height, width):
-        self._mask = self._polygons = self._has_holes = None
-        self.height = height
-        self.width = width
-        m = mask_or_polygons
-        if isinstance(m, dict):
-            # RLEs
-            assert "counts" in m and "size" in m
-            if isinstance(m["counts"], list):  # uncompressed RLEs
-                h, w = m["size"]
-                assert h == height and w == width
-                m = mask_util.frPyObjects(m, h, w)
-            self._mask = mask_util.decode(m)[:, :]
-            return
-        if isinstance(m, list):  # list[ndarray]
-            self._polygons = [np.asarray(x).reshape(-1) for x in m]
-            return
-        if isinstance(m, np.ndarray):  # assumed to be a binary mask
-            assert m.shape[1] != 2, m.shape
-            assert m.shape == (
-                height,
-                width,
-            ), f"mask shape: {m.shape}, target dims: {height}, {width}"
-            self._mask = m.astype("uint8")
-            return
-        raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m)))
-    @property
-    def mask(self):
-        if self._mask is None:
-            self._mask = self.polygons_to_mask(self._polygons)
-        return self._mask
-    @property
-    def polygons(self):
-        if self._polygons is None:
-            self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
-        return self._polygons
-    @property
-    def has_holes(self):
-        if self._has_holes is None:
-            if self._mask is not None:
-                self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
-            else:
-                self._has_holes = False  # if original format is polygon, does not have holes
-        return self._has_holes
-    def mask_to_polygons(self, mask):
-        # cv2.RETR_CCOMP flag retrieves all the contours and arranges them to a 2-level
-        # hierarchy. External contours (boundary) of the object are placed in hierarchy-1.
-        # Internal contours (holes) are placed in hierarchy-2.
-        # cv2.CHAIN_APPROX_NONE flag gets vertices of polygons from contours.
-        mask = np.ascontiguousarray(mask)  # some versions of cv2 does not support incontiguous arr
-        res = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
-        hierarchy = res[-1]
-        if hierarchy is None:  # empty mask
-            return [], False
-        has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0
-        res = res[-2]
-        res = [x.flatten() for x in res]
-        # These coordinates from OpenCV are integers in range [0, W-1 or H-1].
-        # We add 0.5 to turn them into real-value coordinate space. A better solution
-        # would be to first +0.5 and then dilate the returned polygon by 0.5.
-        res = [x + 0.5 for x in res if len(x) >= 6]
-        return res, has_holes
-    def polygons_to_mask(self, polygons):
-        rle = mask_util.frPyObjects(polygons, self.height, self.width)
-        rle = mask_util.merge(rle)
-        return mask_util.decode(rle)[:, :]
-    def area(self):
-        return self.mask.sum()
-    def bbox(self):
-        p = mask_util.frPyObjects(self.polygons, self.height, self.width)
-        p = mask_util.merge(p)
-        bbox = mask_util.toBbox(p)
-        bbox[2] += bbox[0]
-        bbox[3] += bbox[1]
-        return bbox
-class _PanopticPrediction:
-    """
-    Unify different panoptic annotation/prediction formats
-    """
-    def __init__(self, panoptic_seg, segments_info, metadata=None):
-        if segments_info is None:
-            assert metadata is not None
-            # If "segments_info" is None, we assume "panoptic_img" is a
-            # H*W int32 image storing the panoptic_id in the format of
-            # category_id * label_divisor + instance_id. We reserve -1 for
-            # VOID label.
-            label_divisor = metadata.label_divisor
-            segments_info = []
-            for panoptic_label in np.unique(panoptic_seg.numpy()):
-                if panoptic_label == -1:
-                    # VOID region.
-                    continue
-                pred_class = panoptic_label // label_divisor
-                isthing = pred_class in metadata.thing_dataset_id_to_contiguous_id.values()
-                segments_info.append(
-                    {
-                        "id": int(panoptic_label),
-                        "category_id": int(pred_class),
-                        "isthing": bool(isthing),
-                    }
-                )
-        del metadata
-        self._seg = panoptic_seg
-        self._sinfo = {s["id"]: s for s in segments_info}  # seg id -> seg info
-        segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True)
-        areas = areas.numpy()
-        sorted_idxs = np.argsort(-areas)
-        self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs]
-        self._seg_ids = self._seg_ids.tolist()
-        for sid, area in zip(self._seg_ids, self._seg_areas):
-            if sid in self._sinfo:
-                self._sinfo[sid]["area"] = float(area)
-    def non_empty_mask(self):
-        """
-        Returns:
-            (H, W) array, a mask for all pixels that have a prediction
-        """
-        empty_ids = []
-        for id in self._seg_ids:
-            if id not in self._sinfo:
-                empty_ids.append(id)
-        if len(empty_ids) == 0:
-            return np.zeros(self._seg.shape, dtype=np.uint8)
-        assert (
-            len(empty_ids) == 1
-        ), ">1 ids corresponds to no labels. This is currently not supported"
-        return (self._seg != empty_ids[0]).numpy().astype(np.bool)
-    def semantic_masks(self):
-        for sid in self._seg_ids:
-            sinfo = self._sinfo.get(sid)
-            if sinfo is None or sinfo["isthing"]:
-                # Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions.
-                continue
-            yield (self._seg == sid).numpy().astype(np.bool), sinfo
-    def instance_masks(self):
-        for sid in self._seg_ids:
-            sinfo = self._sinfo.get(sid)
-            if sinfo is None or not sinfo["isthing"]:
-                continue
-            mask = (self._seg == sid).numpy().astype(np.bool)
-            if mask.sum() > 0:
-                yield mask, sinfo
-def _create_text_labels(classes, scores, class_names, is_crowd=None):
-    """
-    Args:
-        classes (list[int] or None):
-        scores (list[float] or None):
-        class_names (list[str] or None):
-        is_crowd (list[bool] or None):
-    Returns:
-        list[str] or None
-    """
-    #class_names = CLASS_NAMES
-    labels = None
-    if classes is not None:
-        if class_names is not None and len(class_names) > 0:
-            labels = [class_names[i] for i in classes]
-        else:
-            labels = [str(i) for i in classes]
-    if scores is not None:
-        if labels is None:
-            labels = ["{:.0f}%".format(s * 100) for s in scores]
-        else:
-            labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)]
-    if labels is not None and is_crowd is not None:
-        labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)]
-    return labels
-class VisImage:
-    def __init__(self, img, scale=1.0):
-        """
-        Args:
-            img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255].
-            scale (float): scale the input image
-        """
-        self.img = img
-        self.scale = scale
-        self.width, self.height = img.shape[1], img.shape[0]
-        self._setup_figure(img)
-    def _setup_figure(self, img):
-        """
-        Args:
-            Same as in :meth:`__init__()`.
-        Returns:
-            fig (matplotlib.pyplot.figure): top level container for all the image plot elements.
-            ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system.
-        """
-        fig = mplfigure.Figure(frameon=False)
-        self.dpi = fig.get_dpi()
-        # add a small 1e-2 to avoid precision lost due to matplotlib's truncation
-        # (https://github.com/matplotlib/matplotlib/issues/15363)
-        fig.set_size_inches(
-            (self.width * self.scale + 1e-2) / self.dpi,
-            (self.height * self.scale + 1e-2) / self.dpi,
-        )
-        self.canvas = FigureCanvasAgg(fig)
-        # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig)
-        ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
-        ax.axis("off")
-        self.fig = fig
-        self.ax = ax
-        self.reset_image(img)
-    def reset_image(self, img):
-        """
-        Args:
-            img: same as in __init__
-        """
-        img = img.astype("uint8")
-        self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest")
-    def save(self, filepath):
-        """
-        Args:
-            filepath (str): a string that contains the absolute path, including the file name, where
-                the visualized image will be saved.
-        """
-        self.fig.savefig(filepath)
-    def get_image(self):
-        """
-        Returns:
-            ndarray:
-                the visualized image of shape (H, W, 3) (RGB) in uint8 type.
-                The shape is scaled w.r.t the input image using the given `scale` argument.
-        """
-        canvas = self.canvas
-        s, (width, height) = canvas.print_to_buffer()
-        # buf = io.BytesIO()  # works for cairo backend
-        # canvas.print_rgba(buf)
-        # width, height = self.width, self.height
-        # s = buf.getvalue()
-        buffer = np.frombuffer(s, dtype="uint8")
-        img_rgba = buffer.reshape(height, width, 4)
-        rgb, alpha = np.split(img_rgba, [3], axis=2)
-        return rgb.astype("uint8")
-class Visualizer:
-    """
-    Visualizer that draws data about detection/segmentation on images.
-    It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}`
-    that draw primitive objects to images, as well as high-level wrappers like
-    `draw_{instance_predictions,sem_seg,panoptic_seg_predictions,dataset_dict}`
-    that draw composite data in some pre-defined style.
-    Note that the exact visualization style for the high-level wrappers are subject to change.
-    Style such as color, opacity, label contents, visibility of labels, or even the visibility
-    of objects themselves (e.g. when the object is too small) may change according
-    to different heuristics, as long as the results still look visually reasonable.
-    To obtain a consistent style, you can implement custom drawing functions with the
-    abovementioned primitive methods instead. If you need more customized visualization
-    styles, you can process the data yourself following their format documented in
-    tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not
-    intend to satisfy everyone's preference on drawing styles.
-    This visualizer focuses on high rendering quality rather than performance. It is not
-    designed to be used for real-time applications.
-    """
-    # TODO implement a fast, rasterized version using OpenCV
-    def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE):
-        """
-        Args:
-            img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
-                the height and width of the image respectively. C is the number of
-                color channels. The image is required to be in RGB format since that
-                is a requirement of the Matplotlib library. The image is also expected
-                to be in the range [0, 255].
-            metadata (Metadata): dataset metadata (e.g. class names and colors)
-            instance_mode (ColorMode): defines one of the pre-defined style for drawing
-                instances on an image.
-        """
-        self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
-        if metadata is None:
-            metadata = MetadataCatalog.get("__nonexist__")
-        self.metadata = metadata
-        self.output = VisImage(self.img, scale=scale)
-        self.cpu_device = torch.device("cpu")
-        # too small texts are useless, therefore clamp to 9
-        self._default_font_size = max(
-            np.sqrt(self.output.height * self.output.width) // 90, 10 // scale
-        )
-        self._instance_mode = instance_mode
-        self.keypoint_threshold = _KEYPOINT_THRESHOLD
-    def draw_instance_predictions(self, predictions):
-        """
-        Draw instance-level prediction results on an image.
-        Args:
-            predictions (Instances): the output of an instance detection/segmentation
-                model. Following fields will be used to draw:
-                "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
-        Returns:
-            output (VisImage): image object with visualizations.
-        """
-        boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
-        scores = predictions.scores if predictions.has("scores") else None
-        classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None
-        labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
-        keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None
-        if predictions.has("pred_masks"):
-            masks = np.asarray(predictions.pred_masks)
-            masks = [GenericMask(x, self.output.height, self.output.width) for x in masks]
-        else:
-            masks = None
-        if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
-            colors = [
-                self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes
-            ]
-            alpha = 0.8
-        else:
-            colors = None
-            alpha = 0.5
-        if self._instance_mode == ColorMode.IMAGE_BW:
-            self.output.reset_image(
-                self._create_grayscale_image(
-                    (predictions.pred_masks.any(dim=0) > 0).numpy()
-                    if predictions.has("pred_masks")
-                    else None
-                )
-            )
-            alpha = 0.3
-        self.overlay_instances(
-            masks=masks,
-            boxes=boxes,
-            labels=labels,
-            keypoints=keypoints,
-            assigned_colors=colors,
-            alpha=alpha,
-        )
-        return self.output
-    def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8):
-        """
-        Draw semantic segmentation predictions/labels.
-        Args:
-            sem_seg (Tensor or ndarray): the segmentation of shape (H, W).
-                Each value is the integer label of the pixel.
-            area_threshold (int): segments with less than `area_threshold` are not drawn.
-            alpha (float): the larger it is, the more opaque the segmentations are.
-        Returns:
-            output (VisImage): image object with visualizations.
-        """
-        if isinstance(sem_seg, torch.Tensor):
-            sem_seg = sem_seg.numpy()
-        labels, areas = np.unique(sem_seg, return_counts=True)
-        sorted_idxs = np.argsort(-areas).tolist()
-        labels = labels[sorted_idxs]
-        for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels):
-            try:
-                mask_color = [x / 255 for x in self.metadata.stuff_colors[label]]
-            except (AttributeError, IndexError):
-                mask_color = None
-            binary_mask = (sem_seg == label).astype(np.uint8)
-            text = self.metadata.stuff_classes[label]
-            self.draw_binary_mask(
-                binary_mask,
-                color=mask_color,
-                edge_color=_OFF_WHITE,
-                text=text,
-                alpha=alpha,
-                area_threshold=area_threshold,
-            )
-        return self.output
-    def draw_panoptic_seg(self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7):
-        """
-        Draw panoptic prediction annotations or results.
-        Args:
-            panoptic_seg (Tensor): of shape (height, width) where the values are ids for each
-                segment.
-            segments_info (list[dict] or None): Describe each segment in `panoptic_seg`.
-                If it is a ``list[dict]``, each dict contains keys "id", "category_id".
-                If None, category id of each pixel is computed by
-                ``pixel // metadata.label_divisor``.
-            area_threshold (int): stuff segments with less than `area_threshold` are not drawn.
-        Returns:
-            output (VisImage): image object with visualizations.
-        """
-        pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata)
-        if self._instance_mode == ColorMode.IMAGE_BW:
-            self.output.reset_image(self._create_grayscale_image(pred.non_empty_mask()))
-        # draw mask for all semantic segments first i.e. "stuff"
-        for mask, sinfo in pred.semantic_masks():
-            category_idx = sinfo["category_id"]
-            try:
-                mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]]
-            except AttributeError:
-                mask_color = None
-            text = self.metadata.stuff_classes[category_idx]
-            self.draw_binary_mask(
-                mask,
-                color=mask_color,
-                edge_color=_OFF_WHITE,
-                text=text,
-                alpha=alpha,
-                area_threshold=area_threshold,
-            )
-        # draw mask for all instances second
-        all_instances = list(pred.instance_masks())
-        if len(all_instances) == 0:
-            return self.output
-        masks, sinfo = list(zip(*all_instances))
-        category_ids = [x["category_id"] for x in sinfo]
-        try:
-            scores = [x["score"] for x in sinfo]
-        except KeyError:
-            scores = None
-        labels = _create_text_labels(
-            category_ids, scores, self.metadata.thing_classes, [x.get("iscrowd", 0) for x in sinfo]
-        )
-        try:
-            colors = [
-                self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in category_ids
-            ]
-        except AttributeError:
-            colors = None
-        self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha)
-        return self.output
-    draw_panoptic_seg_predictions = draw_panoptic_seg  # backward compatibility
-    def draw_dataset_dict(self, dic):
-        """
-        Draw annotations/segmentaions in Detectron2 Dataset format.
-        Args:
-            dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.
-        Returns:
-            output (VisImage): image object with visualizations.
-        """
-        annos = dic.get("annotations", None)
-        if annos:
-            if "segmentation" in annos[0]:
-                masks = [x["segmentation"] for x in annos]
-            else:
-                masks = None
-            if "keypoints" in annos[0]:
-                keypts = [x["keypoints"] for x in annos]
-                keypts = np.array(keypts).reshape(len(annos), -1, 3)
-            else:
-                keypts = None
-            boxes = [
-                BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS)
-                if len(x["bbox"]) == 4
-                else x["bbox"]
-                for x in annos
-            ]
-            colors = None
-            category_ids = [x["category_id"] for x in annos]
-            if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
-                colors = [
-                    self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
-                    for c in category_ids
-                ]
-            names = self.metadata.get("thing_classes", None)
-            labels = _create_text_labels(
-                category_ids,
-                scores=None,
-                class_names=names,
-                is_crowd=[x.get("iscrowd", 0) for x in annos],
-            )
-            self.overlay_instances(
-                labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors
-            )
-        sem_seg = dic.get("sem_seg", None)
-        if sem_seg is None and "sem_seg_file_name" in dic:
-            with PathManager.open(dic["sem_seg_file_name"], "rb") as f:
-                sem_seg = Image.open(f)
-                sem_seg = np.asarray(sem_seg, dtype="uint8")
-        if sem_seg is not None:
-            self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5)
-        pan_seg = dic.get("pan_seg", None)
-        if pan_seg is None and "pan_seg_file_name" in dic:
-            with PathManager.open(dic["pan_seg_file_name"], "rb") as f:
-                pan_seg = Image.open(f)
-                pan_seg = np.asarray(pan_seg)
-                from panopticapi.utils import rgb2id
-                pan_seg = rgb2id(pan_seg)
-        if pan_seg is not None:
-            segments_info = dic["segments_info"]
-            pan_seg = torch.tensor(pan_seg)
-            self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.5)
-        return self.output
-    def overlay_instances(
-        self,
-        *,
-        boxes=None,
-        labels=None,
-        masks=None,
-        keypoints=None,
-        assigned_colors=None,
-        alpha=0.5,
-    ):
-        """
-        Args:
-            boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`,
-                or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image,
-                or a :class:`RotatedBoxes`,
-                or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format
-                for the N objects in a single image,
-            labels (list[str]): the text to be displayed for each instance.
-            masks (masks-like object): Supported types are:
-                * :class:`detectron2.structures.PolygonMasks`,
-                  :class:`detectron2.structures.BitMasks`.
-                * list[list[ndarray]]: contains the segmentation masks for all objects in one image.
-                  The first level of the list corresponds to individual instances. The second
-                  level to all the polygon that compose the instance, and the third level
-                  to the polygon coordinates. The third level should have the format of
-                  [x0, y0, x1, y1, ..., xn, yn] (n >= 3).
-                * list[ndarray]: each ndarray is a binary mask of shape (H, W).
-                * list[dict]: each dict is a COCO-style RLE.
-            keypoints (Keypoint or array like): an array-like object of shape (N, K, 3),
-                where the N is the number of instances and K is the number of keypoints.
-                The last dimension corresponds to (x, y, visibility or score).
-            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
-                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
-                for full list of formats that the colors are accepted in.
-        Returns:
-            output (VisImage): image object with visualizations.
-        """
-        num_instances = 0
-        if boxes is not None:
-            boxes = self._convert_boxes(boxes)
-            num_instances = len(boxes)
-        if masks is not None:
-            masks = self._convert_masks(masks)
-            if num_instances:
-                assert len(masks) == num_instances
-            else:
-                num_instances = len(masks)
-        if keypoints is not None:
-            if num_instances:
-                assert len(keypoints) == num_instances
-            else:
-                num_instances = len(keypoints)
-            keypoints = self._convert_keypoints(keypoints)
-        if labels is not None:
-            assert len(labels) == num_instances
-        if assigned_colors is None:
-            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
-        if num_instances == 0:
-            return self.output
-        if boxes is not None and boxes.shape[1] == 5:
-            return self.overlay_rotated_instances(
-                boxes=boxes, labels=labels, assigned_colors=assigned_colors
-            )
-        # Display in largest to smallest order to reduce occlusion.
-        areas = None
-        if boxes is not None:
-            areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
-        elif masks is not None:
-            areas = np.asarray([x.area() for x in masks])
-        if areas is not None:
-            sorted_idxs = np.argsort(-areas).tolist()
-            # Re-order overlapped instances in descending order.
-            boxes = boxes[sorted_idxs] if boxes is not None else None
-            labels = [labels[k] for k in sorted_idxs] if labels is not None else None
-            masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None
-            assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
-            keypoints = keypoints[sorted_idxs] if keypoints is not None else None
-        for i in range(num_instances):
-            color = assigned_colors[i]
-            if boxes is not None:
-                self.draw_box(boxes[i], edge_color=color)
-            if masks is not None:
-                for segment in masks[i].polygons:
-                    self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha)
-            if labels is not None:
-                # first get a box
-                if boxes is not None:
-                    x0, y0, x1, y1 = boxes[i]
-                    text_pos = (x0, y0)  # if drawing boxes, put text on the box corner.
-                    horiz_align = "left"
-                elif masks is not None:
-                    # skip small mask without polygon
-                    if len(masks[i].polygons) == 0:
-                        continue
-                    x0, y0, x1, y1 = masks[i].bbox()
-                    # draw text in the center (defined by median) when box is not drawn
-                    # median is less sensitive to outliers.
-                    text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1]
-                    horiz_align = "center"
-                else:
-                    continue  # drawing the box confidence for keypoints isn't very useful.
-                # for small objects, draw text at the side to avoid occlusion
-                instance_area = (y1 - y0) * (x1 - x0)
-                if (
-                    instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale
-                    or y1 - y0 < 40 * self.output.scale
-                ):
-                    if y1 >= self.output.height - 5:
-                        text_pos = (x1, y0)
-                    else:
-                        text_pos = (x0, y1)
-                height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width)
-                lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
-                font_size = (
-                    np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2)
-                    * 0.5
-                    * self._default_font_size
-                )
-                self.draw_text(
-                    labels[i],
-                    text_pos,
-                    color=lighter_color,
-                    horizontal_alignment=horiz_align,
-                    font_size=font_size,
-                )
-        # draw keypoints
-        if keypoints is not None:
-            for keypoints_per_instance in keypoints:
-                self.draw_and_connect_keypoints(keypoints_per_instance)
-        return self.output
-    def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None):
-        """
-        Args:
-            boxes (ndarray): an Nx5 numpy array of
-                (x_center, y_center, width, height, angle_degrees) format
-                for the N objects in a single image.
-            labels (list[str]): the text to be displayed for each instance.
-            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
-                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
-                for full list of formats that the colors are accepted in.
-        Returns:
-            output (VisImage): image object with visualizations.
-        """
-        num_instances = len(boxes)
-        if assigned_colors is None:
-            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
-        if num_instances == 0:
-            return self.output
-        # Display in largest to smallest order to reduce occlusion.
-        if boxes is not None:
-            areas = boxes[:, 2] * boxes[:, 3]
-        sorted_idxs = np.argsort(-areas).tolist()
-        # Re-order overlapped instances in descending order.
-        boxes = boxes[sorted_idxs]
-        labels = [labels[k] for k in sorted_idxs] if labels is not None else None
-        colors = [assigned_colors[idx] for idx in sorted_idxs]
-        for i in range(num_instances):
-            self.draw_rotated_box_with_label(
-                boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None
-            )
-        return self.output
-    def draw_and_connect_keypoints(self, keypoints):
-        """
-        Draws keypoints of an instance and follows the rules for keypoint connections
-        to draw lines between appropriate keypoints. This follows color heuristics for
-        line color.
-        Args:
-            keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints
-                and the last dimension corresponds to (x, y, probability).
-        Returns:
-            output (VisImage): image object with visualizations.
-        """
-        visible = {}
-        keypoint_names = self.metadata.get("keypoint_names")
-        for idx, keypoint in enumerate(keypoints):
-            # draw keypoint
-            x, y, prob = keypoint
-            if prob > self.keypoint_threshold:
-                self.draw_circle((x, y), color=_RED)
-                if keypoint_names:
-                    keypoint_name = keypoint_names[idx]
-                    visible[keypoint_name] = (x, y)
-        if self.metadata.get("keypoint_connection_rules"):
-            for kp0, kp1, color in self.metadata.keypoint_connection_rules:
-                if kp0 in visible and kp1 in visible:
-                    x0, y0 = visible[kp0]
-                    x1, y1 = visible[kp1]
-                    color = tuple(x / 255.0 for x in color)
-                    self.draw_line([x0, x1], [y0, y1], color=color)
-        # draw lines from nose to mid-shoulder and mid-shoulder to mid-hip
-        # Note that this strategy is specific to person keypoints.
-        # For other keypoints, it should just do nothing
-        try:
-            ls_x, ls_y = visible["left_shoulder"]
-            rs_x, rs_y = visible["right_shoulder"]
-            mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2
-        except KeyError:
-            pass
-        else:
-            # draw line from nose to mid-shoulder
-            nose_x, nose_y = visible.get("nose", (None, None))
-            if nose_x is not None:
-                self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED)
-            try:
-                # draw line from mid-shoulder to mid-hip
-                lh_x, lh_y = visible["left_hip"]
-                rh_x, rh_y = visible["right_hip"]
-            except KeyError:
-                pass
-            else:
-                mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2
-                self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED)
-        return self.output
-    """
-    Primitive drawing functions:
-    """
-    def draw_text(
-        self,
-        text,
-        position,
-        *,
-        font_size=None,
-        color="g",
-        horizontal_alignment="center",
-        rotation=0,
-    ):
-        """
-        Args:
-            text (str): class label
-            position (tuple): a tuple of the x and y coordinates to place text on image.
-            font_size (int, optional): font of the text. If not provided, a font size
-                proportional to the image width is calculated and used.
-            color: color of the text. Refer to `matplotlib.colors` for full list
-                of formats that are accepted.
-            horizontal_alignment (str): see `matplotlib.text.Text`
-            rotation: rotation angle in degrees CCW
-        Returns:
-            output (VisImage): image object with text drawn.
-        """
-        if not font_size:
-            font_size = self._default_font_size
-        # since the text background is dark, we don't want the text to be dark
-        color = np.maximum(list(mplc.to_rgb(color)), 0.2)
-        color[np.argmax(color)] = max(0.8, np.max(color))
-        x, y = position
-        self.output.ax.text(
-            x,
-            y,
-            text,
-            size=font_size * self.output.scale,
-            family="sans-serif",
-            bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
-            verticalalignment="top",
-            horizontalalignment=horizontal_alignment,
-            color=color,
-            zorder=10,
-            rotation=rotation,
-        )
-        return self.output
-    def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
-        """
-        Args:
-            box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
-                are the coordinates of the image's top left corner. x1 and y1 are the
-                coordinates of the image's bottom right corner.
-            alpha (float): blending efficient. Smaller values lead to more transparent masks.
-            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
-                for full list of formats that are accepted.
-            line_style (string): the string to use to create the outline of the boxes.
-        Returns:
-            output (VisImage): image object with box drawn.
-        """
-        x0, y0, x1, y1 = box_coord
-        width = x1 - x0
-        height = y1 - y0
-        linewidth = max(self._default_font_size / 4, 1)
-        self.output.ax.add_patch(
-            mpl.patches.Rectangle(
-                (x0, y0),
-                width,
-                height,
-                fill=False,
-                edgecolor=edge_color,
-                linewidth=linewidth * self.output.scale,
-                alpha=alpha,
-                linestyle=line_style,
-            )
-        )
-        return self.output
-    def draw_rotated_box_with_label(
-        self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None
-    ):
-        """
-        Draw a rotated box with label on its top-left corner.
-        Args:
-            rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle),
-                where cnt_x and cnt_y are the center coordinates of the box.
-                w and h are the width and height of the box. angle represents how
-                many degrees the box is rotated CCW with regard to the 0-degree box.
-            alpha (float): blending efficient. Smaller values lead to more transparent masks.
-            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
-                for full list of formats that are accepted.
-            line_style (string): the string to use to create the outline of the boxes.
-            label (string): label for rotated box. It will not be rendered when set to None.
-        Returns:
-            output (VisImage): image object with box drawn.
-        """
-        cnt_x, cnt_y, w, h, angle = rotated_box
-        area = w * h
-        # use thinner lines when the box is small
-        linewidth = self._default_font_size / (
-            6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3
-        )
-        theta = angle * math.pi / 180.0
-        c = math.cos(theta)
-        s = math.sin(theta)
-        rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)]
-        # x: left->right ; y: top->down
-        rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect]
-        for k in range(4):
-            j = (k + 1) % 4
-            self.draw_line(
-                [rotated_rect[k][0], rotated_rect[j][0]],
-                [rotated_rect[k][1], rotated_rect[j][1]],
-                color=edge_color,
-                linestyle="--" if k == 1 else line_style,
-                linewidth=linewidth,
-            )
-        if label is not None:
-            text_pos = rotated_rect[1]  # topleft corner
-            height_ratio = h / np.sqrt(self.output.height * self.output.width)
-            label_color = self._change_color_brightness(edge_color, brightness_factor=0.7)
-            font_size = (
-                np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size
-            )
-            self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle)
-        return self.output
-    def draw_circle(self, circle_coord, color, radius=3):
-        """
-        Args:
-            circle_coord (list(int) or tuple(int)): contains the x and y coordinates
-                of the center of the circle.
-            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
-                formats that are accepted.
-            radius (int): radius of the circle.
-        Returns:
-            output (VisImage): image object with box drawn.
-        """
-        x, y = circle_coord
-        self.output.ax.add_patch(
-            mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color)
-        )
-        return self.output
-    def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None):
-        """
-        Args:
-            x_data (list[int]): a list containing x values of all the points being drawn.
-                Length of list should match the length of y_data.
-            y_data (list[int]): a list containing y values of all the points being drawn.
-                Length of list should match the length of x_data.
-            color: color of the line. Refer to `matplotlib.colors` for a full list of
-                formats that are accepted.
-            linestyle: style of the line. Refer to `matplotlib.lines.Line2D`
-                for a full list of formats that are accepted.
-            linewidth (float or None): width of the line. When it's None,
-                a default value will be computed and used.
-        Returns:
-            output (VisImage): image object with line drawn.
-        """
-        if linewidth is None:
-            linewidth = self._default_font_size / 3
-        linewidth = max(linewidth, 1)
-        self.output.ax.add_line(
-            mpl.lines.Line2D(
-                x_data,
-                y_data,
-                linewidth=linewidth * self.output.scale,
-                color=color,
-                linestyle=linestyle,
-            )
-        )
-        return self.output
-    def draw_binary_mask(
-        self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.5, area_threshold=0
-    ):
-        """
-        Args:
-            binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and
-                W is the image width. Each value in the array is either a 0 or 1 value of uint8
-                type.
-            color: color of the mask. Refer to `matplotlib.colors` for a full list of
-                formats that are accepted. If None, will pick a random color.
-            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
-                full list of formats that are accepted.
-            text (str): if None, will be drawn in the object's center of mass.
-            alpha (float): blending efficient. Smaller values lead to more transparent masks.
-            area_threshold (float): a connected component small than this will not be shown.
-        Returns:
-            output (VisImage): image object with mask drawn.
-        """
-        if color is None:
-            color = random_color(rgb=True, maximum=1)
-        color = mplc.to_rgb(color)
-        has_valid_segment = False
-        binary_mask = binary_mask.astype("uint8")  # opencv needs uint8
-        mask = GenericMask(binary_mask, self.output.height, self.output.width)
-        shape2d = (binary_mask.shape[0], binary_mask.shape[1])
-        if not mask.has_holes:
-            # draw polygons for regular masks
-            for segment in mask.polygons:
-                area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1]))
-                if area < (area_threshold or 0):
-                    continue
-                has_valid_segment = True
-                segment = segment.reshape(-1, 2)
-                self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha)
-        else:
-            # TODO: Use Path/PathPatch to draw vector graphics:
-            # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon
-            rgba = np.zeros(shape2d + (4,), dtype="float32")
-            rgba[:, :, :3] = color
-            rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha
-            has_valid_segment = True
-            self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0))
-        if text is not None and has_valid_segment:
-            # TODO sometimes drawn on wrong objects. the heuristics here can improve.
-            lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
-            _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8)
-            largest_component_id = np.argmax(stats[1:, -1]) + 1
-            # draw text on the largest component, as well as other very large components.
-            for cid in range(1, _num_cc):
-                if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH:
-                    # median is more stable than centroid
-                    # center = centroids[largest_component_id]
-                    center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1]
-                    self.draw_text(text, center, color=lighter_color)
-        return self.output
-    def draw_polygon(self, segment, color, edge_color=None, alpha=0.5):
-        """
-        Args:
-            segment: numpy array of shape Nx2, containing all the points in the polygon.
-            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
-                formats that are accepted.
-            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
-                full list of formats that are accepted. If not provided, a darker shade
-                of the polygon color will be used instead.
-            alpha (float): blending efficient. Smaller values lead to more transparent masks.
-        Returns:
-            output (VisImage): image object with polygon drawn.
-        """
-        if edge_color is None:
-            # make edge color darker than the polygon color
-            if alpha > 0.8:
-                edge_color = self._change_color_brightness(color, brightness_factor=-0.7)
-            else:
-                edge_color = color
-        edge_color = mplc.to_rgb(edge_color) + (1,)
-        polygon = mpl.patches.Polygon(
-            segment,
-            fill=True,
-            facecolor=mplc.to_rgb(color) + (alpha,),
-            edgecolor=edge_color,
-            linewidth=max(self._default_font_size // 15 * self.output.scale, 1),
-        )
-        self.output.ax.add_patch(polygon)
-        return self.output
-    """
-    Internal methods:
-    """
-    def _jitter(self, color):
-        """
-        Randomly modifies given color to produce a slightly different color than the color given.
-        Args:
-            color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color
-                picked. The values in the list are in the [0.0, 1.0] range.
-        Returns:
-            jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the
-                color after being jittered. The values in the list are in the [0.0, 1.0] range.
-        """
-        color = mplc.to_rgb(color)
-        vec = np.random.rand(3)
-        # better to do it in another color space
-        vec = vec / np.linalg.norm(vec) * 0.5
-        res = np.clip(vec + color, 0, 1)
-        return tuple(res)
-    def _create_grayscale_image(self, mask=None):
-        """
-        Create a grayscale version of the original image.
-        The colors in masked area, if given, will be kept.
-        """
-        img_bw = self.img.astype("f4").mean(axis=2)
-        img_bw = np.stack([img_bw] * 3, axis=2)
-        if mask is not None:
-            img_bw[mask] = self.img[mask]
-        return img_bw
-    def _change_color_brightness(self, color, brightness_factor):
-        """
-        Depending on the brightness_factor, gives a lighter or darker color i.e. a color with
-        less or more saturation than the original color.
-        Args:
-            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
-                formats that are accepted.
-            brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
-                0 will correspond to no change, a factor in [-1.0, 0) range will result in
-                a darker color and a factor in (0, 1.0] range will result in a lighter color.
-        Returns:
-            modified_color (tuple[double]): a tuple containing the RGB values of the
-                modified color. Each value in the tuple is in the [0.0, 1.0] range.
-        """
-        assert brightness_factor >= -1.0 and brightness_factor <= 1.0
-        color = mplc.to_rgb(color)
-        polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color))
-        modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1])
-        modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness
-        modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness
-        modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2])
-        return modified_color
-    def _convert_boxes(self, boxes):
-        """
-        Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension.
-        """
-        if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes):
-            return boxes.tensor.detach().numpy()
-        else:
-            return np.asarray(boxes)
-    def _convert_masks(self, masks_or_polygons):
-        """
-        Convert different format of masks or polygons to a tuple of masks and polygons.
-        Returns:
-            list[GenericMask]:
-        """
-        m = masks_or_polygons
-        if isinstance(m, PolygonMasks):
-            m = m.polygons
-        if isinstance(m, BitMasks):
-            m = m.tensor.numpy()
-        if isinstance(m, torch.Tensor):
-            m = m.numpy()
-        ret = []
-        for x in m:
-            if isinstance(x, GenericMask):
-                ret.append(x)
-            else:
-                ret.append(GenericMask(x, self.output.height, self.output.width))
-        return ret
-    def _convert_keypoints(self, keypoints):
-        if isinstance(keypoints, Keypoints):
-            keypoints = keypoints.tensor
-        keypoints = np.asarray(keypoints)
-        return keypoints
-    def get_output(self):
-        """
-        Returns:
-            output (VisImage): the image output containing the visualizations added
-            to the image.
-        """
-        return self.output
--- a/magic_pdf/model/sub_modules/mfd/__init__.py
+++ b/magic_pdf/model/sub_modules/mfd/__init__.py
--- a/magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py
+++ b/magic_pdf/model/sub_modules/mfd/yolov8/YOLOv8.py
-from tqdm import tqdm
-from ultralytics import YOLO
-class YOLOv8MFDModel(object):
-    def __init__(self, weight, device="cpu"):
-        self.mfd_model = YOLO(weight)
-        self.device = device
-    def predict(self, image):
-        mfd_res = self.mfd_model.predict(
-            image, imgsz=1888, conf=0.25, iou=0.45, verbose=False, device=self.device
-        )[0]
-        return mfd_res
-    def batch_predict(self, images: list, batch_size: int) -> list:
-        images_mfd_res = []
-        # for index in range(0, len(images), batch_size):
-        for index in tqdm(range(0, len(images), batch_size), desc="MFD Predict"):
-            mfd_res = [
-                image_res.cpu()
-                for image_res in self.mfd_model.predict(
-                    images[index : index + batch_size],
-                    imgsz=1888,
-                    conf=0.25,
-                    iou=0.45,
-                    verbose=False,
-                    device=self.device,
-                )
-            ]
-            for image_res in mfd_res:
-                images_mfd_res.append(image_res)
-        return images_mfd_res
--- a/magic_pdf/model/sub_modules/mfd/yolov8/__init__.py
+++ b/magic_pdf/model/sub_modules/mfd/yolov8/__init__.py
--- a/magic_pdf/model/sub_modules/mfr/__init__.py
+++ b/magic_pdf/model/sub_modules/mfr/__init__.py
--- a/magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py
+++ b/magic_pdf/model/sub_modules/mfr/unimernet/Unimernet.py
-import torch
-from torch.utils.data import DataLoader, Dataset
-from tqdm import tqdm
-class MathDataset(Dataset):
-    def __init__(self, image_paths, transform=None):
-        self.image_paths = image_paths
-        self.transform = transform
-    def __len__(self):
-        return len(self.image_paths)
-    def __getitem__(self, idx):
-        raw_image = self.image_paths[idx]
-        if self.transform:
-            image = self.transform(raw_image)
-            return image
-class UnimernetModel(object):
-    def __init__(self, weight_dir, cfg_path, _device_="cpu"):
-        from .unimernet_hf import UnimernetModel
-        if _device_.startswith("mps"):
-            self.model = UnimernetModel.from_pretrained(weight_dir, attn_implementation="eager")
-        else:
-            self.model = UnimernetModel.from_pretrained(weight_dir)
-        self.device = _device_
-        self.model.to(_device_)
-        if not _device_.startswith("cpu"):
-            self.model = self.model.to(dtype=torch.float16)
-        self.model.eval()
-    def predict(self, mfd_res, image):
-        formula_list = []
-        mf_image_list = []
-        for xyxy, conf, cla in zip(
-            mfd_res.boxes.xyxy.cpu(), mfd_res.boxes.conf.cpu(), mfd_res.boxes.cls.cpu()
-        ):
-            xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
-            new_item = {
-                "category_id": 13 + int(cla.item()),
-                "poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
-                "score": round(float(conf.item()), 2),
-                "latex": "",
-            }
-            formula_list.append(new_item)
-            bbox_img = image[ymin:ymax, xmin:xmax]
-            mf_image_list.append(bbox_img)
-        dataset = MathDataset(mf_image_list, transform=self.model.transform)
-        dataloader = DataLoader(dataset, batch_size=32, num_workers=0)
-        mfr_res = []
-        for mf_img in dataloader:
-            mf_img = mf_img.to(dtype=self.model.dtype)
-            mf_img = mf_img.to(self.device)
-            with torch.no_grad():
-                output = self.model.generate({"image": mf_img})
-            mfr_res.extend(output["fixed_str"])
-        for res, latex in zip(formula_list, mfr_res):
-            res["latex"] = latex
-        return formula_list
-    def batch_predict(self, images_mfd_res: list, images: list, batch_size: int = 64) -> list:
-        images_formula_list = []
-        mf_image_list = []
-        backfill_list = []
-        image_info = []  # Store (area, original_index, image) tuples
-        # Collect images with their original indices
-        for image_index in range(len(images_mfd_res)):
-            mfd_res = images_mfd_res[image_index]
-            np_array_image = images[image_index]
-            formula_list = []
-            for idx, (xyxy, conf, cla) in enumerate(zip(
-                    mfd_res.boxes.xyxy, mfd_res.boxes.conf, mfd_res.boxes.cls
-            )):
-                xmin, ymin, xmax, ymax = [int(p.item()) for p in xyxy]
-                new_item = {
-                    "category_id": 13 + int(cla.item()),
-                    "poly": [xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax],
-                    "score": round(float(conf.item()), 2),
-                    "latex": "",
-                }
-                formula_list.append(new_item)
-                bbox_img = np_array_image[ymin:ymax, xmin:xmax]
-                area = (xmax - xmin) * (ymax - ymin)
-                curr_idx = len(mf_image_list)
-                image_info.append((area, curr_idx, bbox_img))
-                mf_image_list.append(bbox_img)
-            images_formula_list.append(formula_list)
-            backfill_list += formula_list
-        # Stable sort by area
-        image_info.sort(key=lambda x: x[0])  # sort by area
-        sorted_indices = [x[1] for x in image_info]
-        sorted_images = [x[2] for x in image_info]
-        # Create mapping for results
-        index_mapping = {new_idx: old_idx for new_idx, old_idx in enumerate(sorted_indices)}
-        # Create dataset with sorted images
-        dataset = MathDataset(sorted_images, transform=self.model.transform)
-        dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=0)
-        # Process batches and store results
-        mfr_res = []
-        # for mf_img in dataloader:
-        with tqdm(total=len(sorted_images), desc="MFR Predict") as pbar:
-            for index, mf_img in enumerate(dataloader):
-                mf_img = mf_img.to(dtype=self.model.dtype)
-                mf_img = mf_img.to(self.device)
-                with torch.no_grad():
-                    output = self.model.generate({"image": mf_img})
-                mfr_res.extend(output["fixed_str"])
-                # 更新进度条，每次增加batch_size，但要注意最后一个batch可能不足batch_size
-                current_batch_size = min(batch_size, len(sorted_images) - index * batch_size)
-                pbar.update(current_batch_size)
-        # Restore original order
-        unsorted_results = [""] * len(mfr_res)
-        for new_idx, latex in enumerate(mfr_res):
-            original_idx = index_mapping[new_idx]
-            unsorted_results[original_idx] = latex
-        # Fill results back
-        for res, latex in zip(backfill_list, unsorted_results):
-            res["latex"] = latex
-        return images_formula_list
--- a/magic_pdf/model/sub_modules/mfr/unimernet/__init__.py
+++ b/magic_pdf/model/sub_modules/mfr/unimernet/__init__.py
--- a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py
+++ b/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/__init__.py
-from .unimer_swin import UnimerSwinConfig, UnimerSwinModel, UnimerSwinImageProcessor
-from .unimer_mbart import UnimerMBartConfig, UnimerMBartModel, UnimerMBartForCausalLM
-from .modeling_unimernet import UnimernetModel
-__all__ = [
-    "UnimerSwinConfig",
-    "UnimerSwinModel",
-    "UnimerSwinImageProcessor",
-    "UnimerMBartConfig",
-    "UnimerMBartModel",
-    "UnimerMBartForCausalLM",
-    "UnimernetModel",
-]
--- a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py
+++ b/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/modeling_unimernet.py
-import os
-import re
-import warnings
-from typing import Optional
-import torch
-from ftfy import fix_text
-from loguru import logger
-from transformers import AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer, PretrainedConfig, PreTrainedModel
-from transformers import VisionEncoderDecoderConfig, VisionEncoderDecoderModel
-from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import logger as base_model_logger
-from .unimer_swin import UnimerSwinConfig, UnimerSwinModel, UnimerSwinImageProcessor
-from .unimer_mbart import UnimerMBartConfig, UnimerMBartForCausalLM
-AutoConfig.register(UnimerSwinConfig.model_type, UnimerSwinConfig)
-AutoConfig.register(UnimerMBartConfig.model_type, UnimerMBartConfig)
-AutoModel.register(UnimerSwinConfig, UnimerSwinModel)
-AutoModelForCausalLM.register(UnimerMBartConfig, UnimerMBartForCausalLM)
-# TODO: rewrite tokenizer
-class TokenizerWrapper:
-    def __init__(self, tokenizer):
-        self.tokenizer = tokenizer
-        self.pad_token_id = self.tokenizer.pad_token_id
-        self.bos_token_id = self.tokenizer.bos_token_id
-        self.eos_token_id = self.tokenizer.eos_token_id
-    def __len__(self):
-        return len(self.tokenizer)
-    def tokenize(self, text, **kwargs):
-        return self.tokenizer(
-            text,
-            return_token_type_ids=False,
-            return_tensors="pt",
-            padding="longest",
-            truncation=True,
-            **kwargs,
-        )
-    def token2str(self, tokens) -> list:
-        generated_text = self.tokenizer.batch_decode(tokens, skip_special_tokens=True)
-        generated_text = [fix_text(text) for text in generated_text]
-        return generated_text
-    def detokenize(self, tokens):
-        toks = [self.tokenizer.convert_ids_to_tokens(tok) for tok in tokens]
-        for b in range(len(toks)):
-            for i in reversed(range(len(toks[b]))):
-                if toks[b][i] is None:
-                    toks[b][i] = ''
-                toks[b][i] = toks[b][i].replace('Ġ', ' ').strip()
-                if toks[b][i] in ([self.tokenizer.bos_token, self.tokenizer.eos_token, self.tokenizer.pad_token]):
-                    del toks[b][i]
-        return toks
-LEFT_PATTERN = re.compile(r'(\\left)(\S*)')
-RIGHT_PATTERN = re.compile(r'(\\right)(\S*)')
-LEFT_COUNT_PATTERN = re.compile(r'\\left(?![a-zA-Z])')
-RIGHT_COUNT_PATTERN = re.compile(r'\\right(?![a-zA-Z])')
-LEFT_RIGHT_REMOVE_PATTERN = re.compile(r'\\left\.?|\\right\.?')
-def fix_latex_left_right(s):
-    """
-    修复LaTeX中的\\left和\\right命令
-    1. 确保它们后面跟有效分隔符
-    2. 平衡\\left和\\right的数量
-    """
-    # 白名单分隔符
-    valid_delims_list = [r'(', r')', r'[', r']', r'{', r'}', r'/', r'|',
-                         r'\{', r'\}', r'\lceil', r'\rceil', r'\lfloor',
-                         r'\rfloor', r'\backslash', r'\uparrow', r'\downarrow',
-                         r'\Uparrow', r'\Downarrow', r'\|', r'\.']
-    # 为\left后缺失有效分隔符的情况添加点
-    def fix_delim(match, is_left=True):
-        cmd = match.group(1)  # \left 或 \right
-        rest = match.group(2) if len(match.groups()) > 1 else ""
-        if not rest or rest not in valid_delims_list:
-            return cmd + "."
-        return match.group(0)
-    # 使用更精确的模式匹配\left和\right命令
-    # 确保它们是独立的命令，不是其他命令的一部分
-    # 使用预编译正则和统一回调函数
-    s = LEFT_PATTERN.sub(lambda m: fix_delim(m, True), s)
-    s = RIGHT_PATTERN.sub(lambda m: fix_delim(m, False), s)
-    # 更精确地计算\left和\right的数量
-    left_count = len(LEFT_COUNT_PATTERN.findall(s))  # 不匹配\lefteqn等
-    right_count = len(RIGHT_COUNT_PATTERN.findall(s))  # 不匹配\rightarrow等
-    if left_count == right_count:
-        # 如果数量相等，检查是否在同一组
-        return fix_left_right_pairs(s)
-    else:
-        # 如果数量不等，移除所有\left和\right
-        # logger.debug(f"latex:{s}")
-        # logger.warning(f"left_count: {left_count}, right_count: {right_count}")
-        return LEFT_RIGHT_REMOVE_PATTERN.sub('', s)
-def fix_left_right_pairs(latex_formula):
-    """
-    检测并修复LaTeX公式中\\left和\\right不在同一组的情况
-    Args:
-        latex_formula (str): 输入的LaTeX公式
-    Returns:
-        str: 修复后的LaTeX公式
-    """
-    # 用于跟踪花括号嵌套层级
-    brace_stack = []
-    # 用于存储\left信息: (位置, 深度, 分隔符)
-    left_stack = []
-    # 存储需要调整的\right信息: (开始位置, 结束位置, 目标位置)
-    adjustments = []
-    i = 0
-    while i < len(latex_formula):
-        # 检查是否是转义字符
-        if i > 0 and latex_formula[i - 1] == '\\':
-            backslash_count = 0
-            j = i - 1
-            while j >= 0 and latex_formula[j] == '\\':
-                backslash_count += 1
-                j -= 1
-            if backslash_count % 2 == 1:
-                i += 1
-                continue
-        # 检测\left命令
-        if i + 5 < len(latex_formula) and latex_formula[i:i + 5] == "\\left" and i + 5 < len(latex_formula):
-            delimiter = latex_formula[i + 5]
-            left_stack.append((i, len(brace_stack), delimiter))
-            i += 6  # 跳过\left和分隔符
-            continue
-        # 检测\right命令
-        elif i + 6 < len(latex_formula) and latex_formula[i:i + 6] == "\\right" and i + 6 < len(latex_formula):
-            delimiter = latex_formula[i + 6]
-            if left_stack:
-                left_pos, left_depth, left_delim = left_stack.pop()
-                # 如果\left和\right不在同一花括号深度
-                if left_depth != len(brace_stack):
-                    # 找到\left所在花括号组的结束位置
-                    target_pos = find_group_end(latex_formula, left_pos, left_depth)
-                    if target_pos != -1:
-                        # 记录需要移动的\right
-                        adjustments.append((i, i + 7, target_pos))
-            i += 7  # 跳过\right和分隔符
-            continue
-        # 处理花括号
-        if latex_formula[i] == '{':
-            brace_stack.append(i)
-        elif latex_formula[i] == '}':
-            if brace_stack:
-                brace_stack.pop()
-        i += 1
-    # 应用调整，从后向前处理以避免索引变化
-    if not adjustments:
-        return latex_formula
-    result = list(latex_formula)
-    adjustments.sort(reverse=True, key=lambda x: x[0])
-    for start, end, target in adjustments:
-        # 提取\right部分
-        right_part = result[start:end]
-        # 从原位置删除
-        del result[start:end]
-        # 在目标位置插入
-        result.insert(target, ''.join(right_part))
-    return ''.join(result)
-def find_group_end(text, pos, depth):
-    """查找特定深度的花括号组的结束位置"""
-    current_depth = depth
-    i = pos
-    while i < len(text):
-        if text[i] == '{' and (i == 0 or not is_escaped(text, i)):
-            current_depth += 1
-        elif text[i] == '}' and (i == 0 or not is_escaped(text, i)):
-            current_depth -= 1
-            if current_depth < depth:
-                return i
-        i += 1
-    return -1  # 未找到对应结束位置
-def is_escaped(text, pos):
-    """检查字符是否被转义"""
-    backslash_count = 0
-    j = pos - 1
-    while j >= 0 and text[j] == '\\':
-        backslash_count += 1
-        j -= 1
-    return backslash_count % 2 == 1
-def fix_unbalanced_braces(latex_formula):
-    """
-    检测LaTeX公式中的花括号是否闭合，并删除无法配对的花括号
-    Args:
-        latex_formula (str): 输入的LaTeX公式
-    Returns:
-        str: 删除无法配对的花括号后的LaTeX公式
-    """
-    stack = []  # 存储左括号的索引
-    unmatched = set()  # 存储不匹配括号的索引
-    i = 0
-    while i < len(latex_formula):
-        # 检查是否是转义的花括号
-        if latex_formula[i] in ['{', '}']:
-            # 计算前面连续的反斜杠数量
-            backslash_count = 0
-            j = i - 1
-            while j >= 0 and latex_formula[j] == '\\':
-                backslash_count += 1
-                j -= 1
-            # 如果前面有奇数个反斜杠，则该花括号是转义的，不参与匹配
-            if backslash_count % 2 == 1:
-                i += 1
-                continue
-            # 否则，该花括号参与匹配
-            if latex_formula[i] == '{':
-                stack.append(i)
-            else:  # latex_formula[i] == '}'
-                if stack:  # 有对应的左括号
-                    stack.pop()
-                else:  # 没有对应的左括号
-                    unmatched.add(i)
-        i += 1
-    # 所有未匹配的左括号
-    unmatched.update(stack)
-    # 构建新字符串，删除不匹配的括号
-    return ''.join(char for i, char in enumerate(latex_formula) if i not in unmatched)
-def process_latex(input_string):
-    """
-        处理LaTeX公式中的反斜杠：
-        1. 如果\后跟特殊字符(#$%&~_^\\{})或空格，保持不变
-        2. 如果\后跟两个小写字母，保持不变
-        3. 其他情况，在\后添加空格
-        Args:
-            input_string (str): 输入的LaTeX公式
-        Returns:
-            str: 处理后的LaTeX公式
-        """
-    def replace_func(match):
-        # 获取\后面的字符
-        next_char = match.group(1)
-        # 如果是特殊字符或空格，保持不变
-        if next_char in "#$%&~_^|\\{} \t\n\r\v\f":
-            return match.group(0)
-        # 如果是字母，检查下一个字符
-        if 'a' <= next_char <= 'z' or 'A' <= next_char <= 'Z':
-            pos = match.start() + 2  # \x后的位置
-            if pos < len(input_string) and ('a' <= input_string[pos] <= 'z' or 'A' <= input_string[pos] <= 'Z'):
-                # 下一个字符也是字母，保持不变
-                return match.group(0)
-        # 其他情况，在\后添加空格
-        return '\\' + ' ' + next_char
-    # 匹配\后面跟一个字符的情况
-    pattern = r'\\(.)'
-    return re.sub(pattern, replace_func, input_string)
-# 常见的在KaTeX/MathJax中可用的数学环境
-ENV_TYPES = ['array', 'matrix', 'pmatrix', 'bmatrix', 'vmatrix',
-             'Bmatrix', 'Vmatrix', 'cases', 'aligned', 'gathered']
-ENV_BEGIN_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}') for env in ENV_TYPES}
-ENV_END_PATTERNS = {env: re.compile(r'\\end\{' + env + r'\}') for env in ENV_TYPES}
-ENV_FORMAT_PATTERNS = {env: re.compile(r'\\begin\{' + env + r'\}\{([^}]*)\}') for env in ENV_TYPES}
-def fix_latex_environments(s):
-    """
-    检测LaTeX中环境（如array）的\\begin和\\end是否匹配
-    1. 如果缺少\\begin标签则在开头添加
-    2. 如果缺少\\end标签则在末尾添加
-    """
-    for env in ENV_TYPES:
-        begin_count = len(ENV_BEGIN_PATTERNS[env].findall(s))
-        end_count = len(ENV_END_PATTERNS[env].findall(s))
-        if begin_count != end_count:
-            if end_count > begin_count:
-                format_match = ENV_FORMAT_PATTERNS[env].search(s)
-                default_format = '{c}' if env == 'array' else ''
-                format_str = '{' + format_match.group(1) + '}' if format_match else default_format
-                missing_count = end_count - begin_count
-                begin_command = '\\begin{' + env + '}' + format_str + ' '
-                s = begin_command * missing_count + s
-            else:
-                missing_count = begin_count - end_count
-                s = s + (' \\end{' + env + '}') * missing_count
-    return s
-UP_PATTERN = re.compile(r'\\up([a-zA-Z]+)')
-COMMANDS_TO_REMOVE_PATTERN = re.compile(
-    r'\\(?:lefteqn|boldmath|ensuremath|centering|textsubscript|sides|textsl|textcent|emph|protect|null)')
-REPLACEMENTS_PATTERNS = {
-    re.compile(r'\\underbar'): r'\\underline',
-    re.compile(r'\\Bar'): r'\\hat',
-    re.compile(r'\\Hat'): r'\\hat',
-    re.compile(r'\\Tilde'): r'\\tilde',
-    re.compile(r'\\slash'): r'/',
-    re.compile(r'\\textperthousand'): r'‰',
-    re.compile(r'\\sun'): r'☉',
-    re.compile(r'\\textunderscore'): r'\\_',
-    re.compile(r'\\fint'): r'⨏',
-    re.compile(r'\\up '): r'\\ ',
-    re.compile(r'\\vline = '): r'\\models ',
-    re.compile(r'\\vDash '): r'\\models ',
-    re.compile(r'\\sq \\sqcup '): r'\\square ',
-}
-QQUAD_PATTERN = re.compile(r'\\qquad(?!\s)')
-def latex_rm_whitespace(s: str):
-    """Remove unnecessary whitespace from LaTeX code."""
-    s = fix_unbalanced_braces(s)
-    s = fix_latex_left_right(s)
-    s = fix_latex_environments(s)
-    # 使用预编译的正则表达式
-    s = UP_PATTERN.sub(
-        lambda m: m.group(0) if m.group(1) in ["arrow", "downarrow", "lus", "silon"] else f"\\{m.group(1)}", s
-    )
-    s = COMMANDS_TO_REMOVE_PATTERN.sub('', s)
-    # 应用所有替换
-    for pattern, replacement in REPLACEMENTS_PATTERNS.items():
-        s = pattern.sub(replacement, s)
-    # 处理LaTeX中的反斜杠和空格
-    s = process_latex(s)
-    # \qquad后补空格
-    s = QQUAD_PATTERN.sub(r'\\qquad ', s)
-    return s
-class UnimernetModel(VisionEncoderDecoderModel):
-    def __init__(
-        self,
-        config: Optional[PretrainedConfig] = None,
-        encoder: Optional[PreTrainedModel] = None,
-        decoder: Optional[PreTrainedModel] = None,
-    ):
-        # VisionEncoderDecoderModel's checking log has bug, disable for temp.
-        base_model_logger.disabled = True
-        try:
-            super().__init__(config, encoder, decoder)
-        finally:
-            base_model_logger.disabled = False
-        if not config or not hasattr(config, "_name_or_path"):
-            raise RuntimeError("config._name_or_path is required by UnimernetModel.")
-        model_path = config._name_or_path
-        self.transform = UnimerSwinImageProcessor()
-        self.tokenizer = TokenizerWrapper(AutoTokenizer.from_pretrained(model_path))
-        self._post_check()
-    def _post_check(self):
-        tokenizer = self.tokenizer
-        if tokenizer.tokenizer.model_max_length != self.config.decoder.max_position_embeddings:
-            warnings.warn(
-                f"decoder.max_position_embeddings={self.config.decoder.max_position_embeddings}," +
-                f" but tokenizer.model_max_length={tokenizer.tokenizer.model_max_length}, will set" +
-                f" tokenizer.model_max_length to {self.config.decoder.max_position_embeddings}.")
-            tokenizer.tokenizer.model_max_length = self.config.decoder.max_position_embeddings
-        assert self.config.decoder.vocab_size == len(tokenizer)
-        assert self.config.decoder_start_token_id == tokenizer.bos_token_id
-        assert self.config.pad_token_id == tokenizer.pad_token_id
-    @classmethod
-    def from_checkpoint(cls, model_path: str, model_filename: str = "pytorch_model.pth", state_dict_strip_prefix="model.model."):
-        config = VisionEncoderDecoderConfig.from_pretrained(model_path)
-        config._name_or_path = model_path
-        config.encoder = UnimerSwinConfig(**vars(config.encoder))
-        config.decoder = UnimerMBartConfig(**vars(config.decoder))
-        encoder = UnimerSwinModel(config.encoder)
-        decoder = UnimerMBartForCausalLM(config.decoder)
-        model = cls(config, encoder, decoder)
-        # load model weights
-        model_file_path = os.path.join(model_path, model_filename)
-        checkpoint = torch.load(model_file_path, map_location="cpu", weights_only=True)
-        state_dict = checkpoint["model"] if "model" in checkpoint else checkpoint
-        if not state_dict:
-            raise RuntimeError("state_dict is empty.")
-        if state_dict_strip_prefix:
-            state_dict = {
-                k[len(state_dict_strip_prefix):] if k.startswith(state_dict_strip_prefix) else k: v
-                for k, v in state_dict.items()
-            }
-        missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
-        if len(unexpected_keys) > 0:
-            warnings.warn("Unexpected key(s) in state_dict: {}.".format(", ".join(f'"{k}"' for k in unexpected_keys)))
-        if len(missing_keys) > 0:
-            raise RuntimeError("Missing key(s) in state_dict: {}.".format(", ".join(f'"{k}"' for k in missing_keys)))
-        return model
-    def forward_bak(self, samples):
-        pixel_values, text = samples["image"], samples["text_input"]
-        text_inputs = self.tokenizer.tokenize(text).to(pixel_values.device)
-        decoder_input_ids, decoder_attention_mask = text_inputs["input_ids"], text_inputs["attention_mask"]
-        num_channels = pixel_values.shape[1]
-        if num_channels == 1:
-            pixel_values = pixel_values.repeat(1, 3, 1, 1)
-        labels = decoder_input_ids * 1
-        labels = labels.masked_fill(labels == self.tokenizer.pad_token_id, -100)
-        loss = self.model(
-            pixel_values=pixel_values,
-            decoder_input_ids=decoder_input_ids[:, :-1],
-            decoder_attention_mask=decoder_attention_mask[:, :-1],
-            labels=labels[:, 1:],
-        ).loss
-        return {"loss": loss}
-    def generate(self, samples, do_sample: bool = False, temperature: float = 0.2, top_p: float = 0.95):
-        pixel_values = samples["image"]
-        num_channels = pixel_values.shape[1]
-        if num_channels == 1:
-            pixel_values = pixel_values.repeat(1, 3, 1, 1)
-        kwargs = {}
-        if do_sample:
-            kwargs["temperature"] = temperature
-            kwargs["top_p"] = top_p
-        outputs = super().generate(
-            pixel_values=pixel_values,
-            max_new_tokens=self.tokenizer.tokenizer.model_max_length, # required
-            decoder_start_token_id=self.tokenizer.tokenizer.bos_token_id,
-            do_sample=do_sample,
-            **kwargs,
-        )
-        outputs = outputs[:, 1:].cpu().numpy()
-        pred_tokens = self.tokenizer.detokenize(outputs)
-        pred_str = self.tokenizer.token2str(outputs)
-        fixed_str = [latex_rm_whitespace(s) for s in pred_str]
-        return {"pred_ids": outputs, "pred_tokens": pred_tokens, "pred_str": pred_str, "fixed_str": fixed_str}
--- a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py
+++ b/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/__init__.py
-from .configuration_unimer_mbart import UnimerMBartConfig
-from .modeling_unimer_mbart import UnimerMBartModel, UnimerMBartForCausalLM
-__all__ = [
-    "UnimerMBartConfig",
-    "UnimerMBartModel",
-    "UnimerMBartForCausalLM",
-]
--- a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py
+++ b/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/configuration_unimer_mbart.py
-# coding=utf-8
-# Copyright 2021, The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""UnimerMBART model configuration"""
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-logger = logging.get_logger(__name__)
-class UnimerMBartConfig(PretrainedConfig):
-    r"""
-    This is the configuration class to store the configuration of a [`MBartModel`]. It is used to instantiate an MBART
-    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
-    defaults will yield a similar configuration to that of the MBART
-    [facebook/mbart-large-cc25](https://huggingface.co/facebook/mbart-large-cc25) architecture.
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-    Args:
-        vocab_size (`int`, *optional*, defaults to 50265):
-            Vocabulary size of the MBART model. Defines the number of different tokens that can be represented by the
-            `inputs_ids` passed when calling [`MBartModel`] or [`TFMBartModel`].
-        d_model (`int`, *optional*, defaults to 1024):
-            Dimensionality of the layers and the pooler layer.
-        qk_squeeze (`int`, *optional*, defaults to 2):
-            Squeeze ratio for query/key's output dimension. See the [UniMERNet paper](https://arxiv.org/abs/2404.15254).
-            Squeeze Attention maps the query and key to a lower-dimensional space without excessive loss of information,
-            thereby accelerating the computation of attention.
-        encoder_layers (`int`, *optional*, defaults to 12):
-            Number of encoder layers.
-        decoder_layers (`int`, *optional*, defaults to 12):
-            Number of decoder layers.
-        encoder_attention_heads (`int`, *optional*, defaults to 16):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        decoder_attention_heads (`int`, *optional*, defaults to 16):
-            Number of attention heads for each attention layer in the Transformer decoder.
-        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
-            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
-            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
-        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
-            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"silu"` and `"gelu_new"` are supported.
-        dropout (`float`, *optional*, defaults to 0.1):
-            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for the attention probabilities.
-        activation_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for activations inside the fully connected layer.
-        classifier_dropout (`float`, *optional*, defaults to 0.0):
-            The dropout ratio for classifier.
-        max_position_embeddings (`int`, *optional*, defaults to 1024):
-            The maximum sequence length that this model might ever be used with. Typically set this to something large
-            just in case (e.g., 512 or 1024 or 2048).
-        init_std (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
-            The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
-            for more details.
-        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
-            The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
-            for more details.
-        scale_embedding (`bool`, *optional*, defaults to `False`):
-            Scale embeddings by diving by sqrt(d_model).
-        use_cache (`bool`, *optional*, defaults to `True`):
-            Whether or not the model should return the last key/values attentions (not used by all models)
-        forced_eos_token_id (`int`, *optional*, defaults to 2):
-            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
-            `eos_token_id`.
-    Example:
-    ```python
-    >>> from transformers import MBartConfig, MBartModel
-    >>> # Initializing a MBART facebook/mbart-large-cc25 style configuration
-    >>> configuration = MBartConfig()
-    >>> # Initializing a model (with random weights) from the facebook/mbart-large-cc25 style configuration
-    >>> model = MBartModel(configuration)
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```"""
-    model_type = "unimer-mbart"
-    keys_to_ignore_at_inference = ["past_key_values"]
-    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
-    def __init__(
-        self,
-        vocab_size=50265,
-        max_position_embeddings=1024,
-        encoder_layers=12,
-        encoder_ffn_dim=4096,
-        encoder_attention_heads=16,
-        decoder_layers=12,
-        decoder_ffn_dim=4096,
-        decoder_attention_heads=16,
-        encoder_layerdrop=0.0,
-        decoder_layerdrop=0.0,
-        use_cache=True,
-        is_encoder_decoder=True,
-        activation_function="gelu",
-        d_model=1024,
-        qk_squeeze=2,
-        dropout=0.1,
-        attention_dropout=0.0,
-        activation_dropout=0.0,
-        init_std=0.02,
-        classifier_dropout=0.0,
-        scale_embedding=False,
-        pad_token_id=1,
-        bos_token_id=0,
-        eos_token_id=2,
-        forced_eos_token_id=2,
-        **kwargs,
-    ):
-        self.vocab_size = vocab_size
-        self.max_position_embeddings = max_position_embeddings
-        self.d_model = d_model
-        self.qk_squeeze = qk_squeeze
-        self.encoder_ffn_dim = encoder_ffn_dim
-        self.encoder_layers = encoder_layers
-        self.encoder_attention_heads = encoder_attention_heads
-        self.decoder_ffn_dim = decoder_ffn_dim
-        self.decoder_layers = decoder_layers
-        self.decoder_attention_heads = decoder_attention_heads
-        self.dropout = dropout
-        self.attention_dropout = attention_dropout
-        self.activation_dropout = activation_dropout
-        self.activation_function = activation_function
-        self.init_std = init_std
-        self.encoder_layerdrop = encoder_layerdrop
-        self.decoder_layerdrop = decoder_layerdrop
-        self.classifier_dropout = classifier_dropout
-        self.use_cache = use_cache
-        self.num_hidden_layers = encoder_layers
-        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
-        super().__init__(
-            pad_token_id=pad_token_id,
-            bos_token_id=bos_token_id,
-            eos_token_id=eos_token_id,
-            is_encoder_decoder=is_encoder_decoder,
-            forced_eos_token_id=forced_eos_token_id,
-            **kwargs,
-        )
--- a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py
+++ b/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/modeling_unimer_mbart.py
-# coding=utf-8
-# Copyright 2021, The Facebook AI Research Team and The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch UnimerMBART model."""
-import copy
-import math
-from dataclasses import dataclass
-from typing import List, Optional, Tuple, Union
-import torch
-import torch.nn.functional as F
-import torch.utils.checkpoint
-from torch import nn
-from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
-from transformers.activations import ACT2FN
-from transformers.modeling_attn_mask_utils import (
-    _prepare_4d_attention_mask,
-    _prepare_4d_attention_mask_for_sdpa,
-    _prepare_4d_causal_attention_mask,
-    _prepare_4d_causal_attention_mask_for_sdpa,
-)
-from transformers.modeling_outputs import (
-    BaseModelOutput,
-    BaseModelOutputWithPastAndCrossAttentions,
-    CausalLMOutputWithCrossAttentions,
-    Seq2SeqLMOutput,
-    Seq2SeqModelOutput,
-    Seq2SeqQuestionAnsweringModelOutput,
-    Seq2SeqSequenceClassifierOutput,
-)
-from transformers import GenerationMixin, PreTrainedModel
-from transformers.utils import (
-    add_code_sample_docstrings,
-    add_end_docstrings,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    is_flash_attn_2_available,
-    is_flash_attn_greater_or_equal_2_10,
-    logging,
-    replace_return_docstrings,
-)
-from .configuration_unimer_mbart import UnimerMBartConfig
-if is_flash_attn_2_available():
-    from flash_attn import flash_attn_func, flash_attn_varlen_func
-    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
-logger = logging.get_logger(__name__)
-_CHECKPOINT_FOR_DOC = "facebook/mbart-large-cc25"
-_CONFIG_FOR_DOC = "MBartConfig"
-# Base model docstring
-_EXPECTED_OUTPUT_SHAPE = [1, 8, 1024]
-# Copied from transformers.models.llama.modeling_llama._get_unpad_data
-def _get_unpad_data(attention_mask):
-    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
-    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
-    max_seqlen_in_batch = seqlens_in_batch.max().item()
-    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
-    return (
-        indices,
-        cu_seqlens,
-        max_seqlen_in_batch,
-    )
-def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int):
-    """
-    Shift input ids one token to the right, and wrap the last non pad token (the <LID> token) Note that MBart does not
-    have a single `decoder_start_token_id` in contrast to other Bart-like models.
-    """
-    prev_output_tokens = input_ids.clone()
-    if pad_token_id is None:
-        raise ValueError("self.model.config.pad_token_id has to be defined.")
-    # replace possible -100 values in labels by `pad_token_id`
-    prev_output_tokens.masked_fill_(prev_output_tokens == -100, pad_token_id)
-    index_of_eos = (prev_output_tokens.ne(pad_token_id).sum(dim=1) - 1).unsqueeze(-1)
-    decoder_start_tokens = prev_output_tokens.gather(1, index_of_eos).squeeze()
-    prev_output_tokens[:, 1:] = prev_output_tokens[:, :-1].clone()
-    prev_output_tokens[:, 0] = decoder_start_tokens
-    return prev_output_tokens
-@dataclass
-class CausalLMOutputWithCrossAttentionsAndCounting(CausalLMOutputWithCrossAttentions):
-    """
-    Base class for causal language model (or autoregressive) outputs.
-    Args:
-        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
-            Language modeling loss (for next-token prediction).
-        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
-            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
-        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
-            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
-            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
-            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
-        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
-            heads.
-        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
-            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
-            sequence_length)`.
-            Cross attentions weights after the attention softmax, used to compute the weighted average in the
-            cross-attention heads.
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `torch.FloatTensor` tuples of length `config.n_layers`, with each tuple containing the cached key,
-            value states of the self-attention and the cross-attention layers if model is used in encoder-decoder
-            setting. Only relevant if `config.is_decoder = True`.
-            Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see
-            `past_key_values` input) to speed up sequential decoding.
-        counting:
-            Counting
-    """
-    counting: Optional[torch.FloatTensor] = None
-# Copied from transformers.models.bart.modeling_bart.BartLearnedPositionalEmbedding with Bart->MBart
-class UnimerMBartLearnedPositionalEmbedding(nn.Embedding):
-    """
-    This module learns positional embeddings up to a fixed maximum size.
-    """
-    def __init__(self, num_embeddings: int, embedding_dim: int):
-        # MBart is set up so that if padding_idx is specified then offset the embedding ids by 2
-        # and adjust num_embeddings appropriately. Other models don't have this hack
-        self.offset = 2
-        super().__init__(num_embeddings + self.offset, embedding_dim)
-    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
-        """`input_ids' shape is expected to be [bsz x seqlen]."""
-        bsz, seq_len = input_ids.shape[:2]
-        positions = torch.arange(
-            past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
-        ).expand(bsz, -1)
-        return super().forward(positions + self.offset)
-# Copied from transformers.models.bart.modeling_bart.BartScaledWordEmbedding with Bart->MBart
-class UnimerMBartScaledWordEmbedding(nn.Embedding):
-    """
-    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
-    """
-    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
-        super().__init__(num_embeddings, embedding_dim, padding_idx)
-        self.embed_scale = embed_scale
-    def forward(self, input_ids: torch.Tensor):
-        return super().forward(input_ids) * self.embed_scale
-# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->MBart
-class UnimerMBartAttention(nn.Module):
-    """Multi-headed attention from 'Attention Is All You Need' paper, with qk_squeeze"""
-    def __init__(
-        self,
-        embed_dim: int,
-        num_heads: int,
-        dropout: float = 0.0,
-        is_decoder: bool = False,
-        bias: bool = True,
-        is_causal: bool = False,
-        *,
-        config: UnimerMBartConfig,
-    ):
-        super().__init__()
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-        self.dropout = dropout
-        self.head_dim = embed_dim // num_heads
-        self.config = config
-        if (self.head_dim * num_heads) != self.embed_dim:
-            raise ValueError(
-                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
-                f" and `num_heads`: {num_heads})."
-            )
-        self.squeeze_dim = embed_dim // config.qk_squeeze
-        self.squeeze_head_dim = self.squeeze_dim // num_heads
-        self.scaling = self.squeeze_head_dim**-0.5
-        self.is_decoder = is_decoder
-        self.is_causal = is_causal
-        self.q_proj = nn.Linear(embed_dim, self.squeeze_dim, bias=bias)
-        self.k_proj = nn.Linear(embed_dim, self.squeeze_dim, bias=bias)
-        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-    def _shape_qk(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads, self.squeeze_head_dim).transpose(1, 2).contiguous()
-    def _shape_v(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        key_value_states: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
-        output_attentions: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        """Input shape: Batch x Time x Channel"""
-        # if key_value_states are provided this layer is used as a cross-attention layer
-        # for the decoder
-        is_cross_attention = key_value_states is not None
-        bsz, tgt_len, _ = hidden_states.size()
-        # get query proj
-        query_states = self.q_proj(hidden_states) * self.scaling
-        # get key, value proj
-        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
-        # is checking that the `sequence_length` of the `past_key_value` is the same as
-        # the provided `key_value_states` to support prefix tuning
-        if (
-            is_cross_attention
-            and past_key_value is not None
-            and past_key_value[0].shape[2] == key_value_states.shape[1]
-        ):
-            # reuse k,v, cross_attentions
-            key_states = past_key_value[0]
-            value_states = past_key_value[1]
-        elif is_cross_attention:
-            # cross_attentions
-            key_states = self._shape_qk(self.k_proj(key_value_states), -1, bsz)
-            value_states = self._shape_v(self.v_proj(key_value_states), -1, bsz)
-        elif past_key_value is not None:
-            # reuse k, v, self_attention
-            key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
-            value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
-            value_states = torch.cat([past_key_value[1], value_states], dim=2)
-        else:
-            # self_attention
-            key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
-            value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
-        if self.is_decoder:
-            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
-            # Further calls to cross_attention layer can then reuse all cross-attention
-            # key/value_states (first "if" case)
-            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
-            # all previous decoder key/value_states. Further calls to uni-directional self-attention
-            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
-            # if encoder bi-directional self-attention `past_key_value` is always `None`
-            past_key_value = (key_states, value_states)
-        proj_shape = (bsz * self.num_heads, -1, self.squeeze_head_dim)
-        value_shape = (bsz * self.num_heads, -1, self.head_dim)
-        query_states = self._shape_qk(query_states, tgt_len, bsz).view(*proj_shape)
-        key_states = key_states.reshape(*proj_shape)
-        value_states = value_states.reshape(*value_shape)
-        src_len = key_states.size(1)
-        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
-        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
-            raise ValueError(
-                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
-                f" {attn_weights.size()}"
-            )
-        if attention_mask is not None:
-            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
-                raise ValueError(
-                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
-                )
-            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
-            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
-        if layer_head_mask is not None:
-            if layer_head_mask.size() != (self.num_heads,):
-                raise ValueError(
-                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
-                    f" {layer_head_mask.size()}"
-                )
-            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-        if output_attentions:
-            # this operation is a bit awkward, but it's required to
-            # make sure that attn_weights keeps its gradient.
-            # In order to do so, attn_weights have to be reshaped
-            # twice and have to be reused in the following
-            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
-        else:
-            attn_weights_reshaped = None
-        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
-        attn_output = torch.bmm(attn_probs, value_states)
-        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
-            raise ValueError(
-                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
-                f" {attn_output.size()}"
-            )
-        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
-        attn_output = attn_output.transpose(1, 2)
-        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned across GPUs when using tensor-parallelism.
-        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
-        attn_output = self.out_proj(attn_output)
-        return attn_output, attn_weights_reshaped, past_key_value
-# Copied from transformers.models.bart.modeling_bart.BartFlashAttention2 with Bart->MBart
-class UnimerMBartFlashAttention2(UnimerMBartAttention):
-    """
-    MBart flash attention module. This module inherits from `MBartSqueezeAttention` as the weights of the module stays
-    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
-    flash attention and deal with padding tokens in case the input contains any of them.
-    """
-    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
-        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
-        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
-        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
-    # def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-    #     return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
-    def _shape_qk(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads, self.squeeze_head_dim)
-    def _shape_v(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        key_value_states: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
-        output_attentions: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        # MBartFlashAttention2 attention does not support output_attentions
-        if output_attentions:
-            raise ValueError("MBartFlashAttention2 attention does not support output_attentions")
-        # if key_value_states are provided this layer is used as a cross-attention layer
-        # for the decoder
-        is_cross_attention = key_value_states is not None
-        bsz, q_len, _ = hidden_states.size()
-        # get query proj
-        query_states = self._shape_qk(self.q_proj(hidden_states), -1, bsz)
-        # get key, value proj
-        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
-        # is checking that the `sequence_length` of the `past_key_value` is the same as
-        # the provided `key_value_states` to support prefix tuning
-        if (
-            is_cross_attention
-            and past_key_value is not None
-            and past_key_value[0].shape[2] == key_value_states.shape[1]
-        ):
-            # reuse k,v, cross_attentions
-            key_states = past_key_value[0].transpose(1, 2)
-            value_states = past_key_value[1].transpose(1, 2)
-        elif is_cross_attention:
-            # cross_attentions
-            key_states = self._shape_qk(self.k_proj(key_value_states), -1, bsz)
-            value_states = self._shape_v(self.v_proj(key_value_states), -1, bsz)
-        elif past_key_value is not None:
-            # reuse k, v, self_attention
-            key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
-            value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
-            key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1)
-            value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1)
-        else:
-            # self_attention
-            key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
-            value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
-        if self.is_decoder:
-            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
-            # Further calls to cross_attention layer can then reuse all cross-attention
-            # key/value_states (first "if" case)
-            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
-            # all previous decoder key/value_states. Further calls to uni-directional self-attention
-            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
-            # if encoder bi-directional self-attention `past_key_value` is always `None`
-            past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2))
-        kv_seq_len = key_states.shape[-2]
-        if past_key_value is not None:
-            kv_seq_len += past_key_value[0].shape[-2]
-        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
-        # therefore the input hidden states gets silently casted in float32. Hence, we need
-        # cast them back in the correct dtype just to be sure everything works as expected.
-        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
-        # in fp32. (LlamaRMSNorm handles it correctly)
-        input_dtype = query_states.dtype
-        if input_dtype == torch.float32:
-            if torch.is_autocast_enabled():
-                target_dtype = torch.get_autocast_gpu_dtype()
-            # Handle the case where the model is quantized
-            elif hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
-            else:
-                target_dtype = self.q_proj.weight.dtype
-            logger.warning_once(
-                f"The input hidden states seems to be silently casted in float32, this might be related to"
-                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
-                f" {target_dtype}."
-            )
-            query_states = query_states.to(target_dtype)
-            key_states = key_states.to(target_dtype)
-            value_states = value_states.to(target_dtype)
-        attn_output = self._flash_attention_forward(
-            query_states, key_states, value_states, attention_mask, q_len, dropout=self.dropout
-        )
-        attn_output = attn_output.reshape(bsz, q_len, -1)
-        attn_output = self.out_proj(attn_output)
-        if not output_attentions:
-            attn_weights = None
-        return attn_output, attn_weights, past_key_value
-    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
-    def _flash_attention_forward(
-        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
-    ):
-        """
-        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
-        first unpad the input, then computes the attention scores and pad the final attention scores.
-        Args:
-            query_states (`torch.Tensor`):
-                Input query states to be passed to Flash Attention API
-            key_states (`torch.Tensor`):
-                Input key states to be passed to Flash Attention API
-            value_states (`torch.Tensor`):
-                Input value states to be passed to Flash Attention API
-            attention_mask (`torch.Tensor`):
-                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
-                position of padding tokens and 1 for the position of non-padding tokens.
-            dropout (`float`):
-                Attention dropout
-            softmax_scale (`float`, *optional*):
-                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
-        """
-        if not self._flash_attn_uses_top_left_mask:
-            causal = self.is_causal
-        else:
-            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
-            causal = self.is_causal and query_length != 1
-        # Contains at least one padding token in the sequence
-        if attention_mask is not None:
-            batch_size = query_states.shape[0]
-            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
-                query_states, key_states, value_states, attention_mask, query_length
-            )
-            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
-            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
-            attn_output_unpad = flash_attn_varlen_func(
-                query_states,
-                key_states,
-                value_states,
-                cu_seqlens_q=cu_seqlens_q,
-                cu_seqlens_k=cu_seqlens_k,
-                max_seqlen_q=max_seqlen_in_batch_q,
-                max_seqlen_k=max_seqlen_in_batch_k,
-                dropout_p=dropout,
-                softmax_scale=softmax_scale,
-                causal=causal,
-            )
-            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
-        else:
-            attn_output = flash_attn_func(
-                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
-            )
-        return attn_output
-    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
-    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
-        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
-        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
-        key_layer = index_first_axis(
-            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
-        )
-        value_layer = index_first_axis(
-            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
-        )
-        if query_length == kv_seq_len:
-            query_layer = index_first_axis(
-                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
-            )
-            cu_seqlens_q = cu_seqlens_k
-            max_seqlen_in_batch_q = max_seqlen_in_batch_k
-            indices_q = indices_k
-        elif query_length == 1:
-            max_seqlen_in_batch_q = 1
-            cu_seqlens_q = torch.arange(
-                batch_size + 1, dtype=torch.int32, device=query_layer.device
-            )  # There is a memcpy here, that is very bad.
-            indices_q = cu_seqlens_q[:-1]
-            query_layer = query_layer.squeeze(1)
-        else:
-            # The -q_len: slice assumes left padding.
-            attention_mask = attention_mask[:, -query_length:]
-            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
-        return (
-            query_layer,
-            key_layer,
-            value_layer,
-            indices_q,
-            (cu_seqlens_q, cu_seqlens_k),
-            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
-        )
-class UnimerMBartSdpaAttention(UnimerMBartAttention):
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        key_value_states: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
-        output_attentions: bool = False,
-    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
-        """Input shape: Batch x Time x Channel"""
-        if output_attentions or layer_head_mask is not None:
-            # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
-            logger.warning(
-                "BartModel is using BartSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
-                ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
-            )
-            return super().forward(
-                hidden_states,
-                key_value_states=key_value_states,
-                past_key_value=past_key_value,
-                attention_mask=attention_mask,
-                layer_head_mask=layer_head_mask,
-                output_attentions=output_attentions,
-            )
-        # if key_value_states are provided this layer is used as a cross-attention layer
-        # for the decoder
-        is_cross_attention = key_value_states is not None
-        bsz, tgt_len, _ = hidden_states.size()
-        # get query proj
-        query_states = self.q_proj(hidden_states)
-        # get key, value proj
-        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
-        # is checking that the `sequence_length` of the `past_key_value` is the same as
-        # the provided `key_value_states` to support prefix tuning
-        if (
-            is_cross_attention
-            and past_key_value is not None
-            and past_key_value[0].shape[2] == key_value_states.shape[1]
-        ):
-            # reuse k,v, cross_attentions
-            key_states = past_key_value[0]
-            value_states = past_key_value[1]
-        elif is_cross_attention:
-            # cross_attentions
-            key_states = self._shape_qk(self.k_proj(key_value_states), -1, bsz)
-            value_states = self._shape_v(self.v_proj(key_value_states), -1, bsz)
-        elif past_key_value is not None:
-            # reuse k, v, self_attention
-            key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
-            value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
-            value_states = torch.cat([past_key_value[1], value_states], dim=2)
-        else:
-            # self_attention
-            key_states = self._shape_qk(self.k_proj(hidden_states), -1, bsz)
-            value_states = self._shape_v(self.v_proj(hidden_states), -1, bsz)
-        if self.is_decoder:
-            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
-            # Further calls to cross_attention layer can then reuse all cross-attention
-            # key/value_states (first "if" case)
-            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
-            # all previous decoder key/value_states. Further calls to uni-directional self-attention
-            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
-            # if encoder bi-directional self-attention `past_key_value` is always `None`
-            past_key_value = (key_states, value_states)
-        query_states = self._shape_qk(query_states, tgt_len, bsz)
-        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
-        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
-        # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
-        is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False
-        # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
-        # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
-        attn_output = torch.nn.functional.scaled_dot_product_attention(
-            query_states,
-            key_states,
-            value_states,
-            attn_mask=attention_mask,
-            dropout_p=self.dropout if self.training else 0.0,
-            is_causal=is_causal,
-        )
-        if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
-            raise ValueError(
-                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
-                f" {attn_output.size()}"
-            )
-        attn_output = attn_output.transpose(1, 2)
-        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
-        # partitioned across GPUs when using tensor-parallelism.
-        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
-        attn_output = self.out_proj(attn_output)
-        return attn_output, None, past_key_value
-UNIMER_MBART_ATTENTION_CLASSES = {
-    "eager": UnimerMBartAttention,
-    "flash_attention_2": UnimerMBartFlashAttention2,
-    "sdpa": UnimerMBartSdpaAttention,
-}
-class UnimerMBartEncoderLayer(nn.Module):
-    def __init__(self, config: UnimerMBartConfig):
-        super().__init__()
-        self.embed_dim = config.d_model
-        self.self_attn = UNIMER_MBART_ATTENTION_CLASSES[config._attn_implementation](
-            embed_dim=self.embed_dim,
-            num_heads=config.encoder_attention_heads,
-            dropout=config.attention_dropout,
-            config=config,
-        )
-        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.dropout = config.dropout
-        self.activation_fn = ACT2FN[config.activation_function]
-        self.activation_dropout = config.activation_dropout
-        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
-        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
-        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: torch.Tensor,
-        layer_head_mask: torch.Tensor,
-        output_attentions: bool = False,
-    ) -> torch.Tensor:
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-            attention_mask (`torch.FloatTensor`): attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
-                `(encoder_attention_heads,)`.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-        """
-        residual = hidden_states
-        hidden_states = self.self_attn_layer_norm(hidden_states)
-        hidden_states, attn_weights, _ = self.self_attn(
-            hidden_states=hidden_states,
-            attention_mask=attention_mask,
-            layer_head_mask=layer_head_mask,
-            output_attentions=output_attentions,
-        )
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        hidden_states = residual + hidden_states
-        residual = hidden_states
-        hidden_states = self.final_layer_norm(hidden_states)
-        hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
-        hidden_states = self.fc2(hidden_states)
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        hidden_states = residual + hidden_states
-        if hidden_states.dtype == torch.float16 and (
-            torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any()
-        ):
-            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
-            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
-        outputs = (hidden_states,)
-        if output_attentions:
-            outputs += (attn_weights,)
-        return outputs
-class UnimerMBartDecoderLayer(nn.Module):
-    def __init__(self, config: UnimerMBartConfig):
-        super().__init__()
-        self.embed_dim = config.d_model
-        self.self_attn = UNIMER_MBART_ATTENTION_CLASSES[config._attn_implementation](
-            embed_dim=self.embed_dim,
-            num_heads=config.decoder_attention_heads,
-            dropout=config.attention_dropout,
-            is_decoder=True,
-            is_causal=True,
-            config=config,
-        )
-        self.dropout = config.dropout
-        self.activation_fn = ACT2FN[config.activation_function]
-        self.activation_dropout = config.activation_dropout
-        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.encoder_attn = UNIMER_MBART_ATTENTION_CLASSES[config._attn_implementation](
-            self.embed_dim,
-            config.decoder_attention_heads,
-            dropout=config.attention_dropout,
-            is_decoder=True,
-            config=config,
-        )
-        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
-        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
-        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        output_attentions: Optional[bool] = False,
-        use_cache: Optional[bool] = True,
-    ) -> torch.Tensor:
-        """
-        Args:
-            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
-            attention_mask (`torch.FloatTensor`): attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            encoder_hidden_states (`torch.FloatTensor`):
-                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
-            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
-                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
-            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
-                `(encoder_attention_heads,)`.
-            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
-                size `(decoder_attention_heads,)`.
-            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-        """
-        residual = hidden_states
-        hidden_states = self.self_attn_layer_norm(hidden_states)
-        # Self Attention
-        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
-        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
-        # add present self-attn cache to positions 1,2 of present_key_value tuple
-        hidden_states, self_attn_weights, present_key_value = self.self_attn(
-            hidden_states=hidden_states,
-            past_key_value=self_attn_past_key_value,
-            attention_mask=attention_mask,
-            layer_head_mask=layer_head_mask,
-            output_attentions=output_attentions,
-        )
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        hidden_states = residual + hidden_states
-        # Cross-Attention Block
-        cross_attn_present_key_value = None
-        cross_attn_weights = None
-        if encoder_hidden_states is not None:
-            residual = hidden_states
-            hidden_states = self.encoder_attn_layer_norm(hidden_states)
-            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
-            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
-            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
-                hidden_states=hidden_states,
-                key_value_states=encoder_hidden_states,
-                attention_mask=encoder_attention_mask,
-                layer_head_mask=cross_attn_layer_head_mask,
-                past_key_value=cross_attn_past_key_value,
-                output_attentions=output_attentions,
-            )
-            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-            hidden_states = residual + hidden_states
-            # add cross-attn to positions 3,4 of present_key_value tuple
-            present_key_value = present_key_value + cross_attn_present_key_value
-        # Fully Connected
-        residual = hidden_states
-        hidden_states = self.final_layer_norm(hidden_states)
-        hidden_states = self.activation_fn(self.fc1(hidden_states))
-        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
-        hidden_states = self.fc2(hidden_states)
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        hidden_states = residual + hidden_states
-        outputs = (hidden_states,)
-        if output_attentions:
-            outputs += (self_attn_weights, cross_attn_weights)
-        if use_cache:
-            outputs += (present_key_value,)
-        return outputs
-# Copied from transformers.models.bart.modeling_bart.BartClassificationHead with Bart->MBart
-class UnimerMBartClassificationHead(nn.Module):
-    """Head for sentence-level classification tasks."""
-    def __init__(
-        self,
-        input_dim: int,
-        inner_dim: int,
-        num_classes: int,
-        pooler_dropout: float,
-    ):
-        super().__init__()
-        self.dense = nn.Linear(input_dim, inner_dim)
-        self.dropout = nn.Dropout(p=pooler_dropout)
-        self.out_proj = nn.Linear(inner_dim, num_classes)
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.dropout(hidden_states)
-        hidden_states = self.dense(hidden_states)
-        hidden_states = torch.tanh(hidden_states)
-        hidden_states = self.dropout(hidden_states)
-        hidden_states = self.out_proj(hidden_states)
-        return hidden_states
-class UnimerMBartPreTrainedModel(PreTrainedModel):
-    config_class = UnimerMBartConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["MBartDecoderLayer", "MBartSqueezeAttention"]
-    _supports_flash_attn_2 = True
-    _supports_sdpa = True
-    def _init_weights(self, module):
-        std = self.config.init_std
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-    @property
-    def dummy_inputs(self):
-        pad_token = self.config.pad_token_id
-        input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device)
-        dummy_inputs = {
-            "attention_mask": input_ids.ne(pad_token),
-            "input_ids": input_ids,
-        }
-        return dummy_inputs
-MBART_START_DOCSTRING = r"""
-    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
-    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
-    etc.)
-    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
-    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
-    and behavior.
-    Parameters:
-        config ([`MBartConfig`]):
-            Model configuration class with all the parameters of the model. Initializing with a config file does not
-            load the weights associated with the model, only the configuration. Check out the
-            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
-"""
-MBART_GENERATION_EXAMPLE = r"""
-    Translation example:
-    ```python
-    >>> from transformers import AutoTokenizer, MBartForConditionalGeneration
-    >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
-    >>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-en-ro")
-    >>> example_english_phrase = "42 is the answer"
-    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
-    >>> # Translate
-    >>> generated_ids = model.generate(**inputs, num_beams=4, max_length=5)
-    >>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    '42 este răspuns'
-    ```
-    Mask filling example:
-    ```python
-    >>> from transformers import AutoTokenizer, MBartForConditionalGeneration
-    >>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-cc25")
-    >>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")
-    >>> # de_DE is the language symbol id <LID> for German
-    >>> TXT = "</s> Meine Freunde sind <mask> nett aber sie essen zu viel Kuchen. </s> de_DE"
-    >>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="pt")["input_ids"]
-    >>> logits = model(input_ids).logits
-    >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
-    >>> probs = logits[0, masked_index].softmax(dim=0)
-    >>> values, predictions = probs.topk(5)
-    >>> tokenizer.decode(predictions).split()
-    ['nett', 'sehr', 'ganz', 'nicht', 'so']
-    ```
-"""
-MBART_INPUTS_DOCSTRING = r"""
-    Args:
-        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
-            it.
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-            [What are input IDs?](../glossary#input-ids)
-        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-            - 1 for tokens that are **not masked**,
-            - 0 for tokens that are **masked**.
-            [What are attention masks?](../glossary#attention-mask)
-        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
-            Indices of decoder input sequence tokens in the vocabulary.
-            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-            [`PreTrainedTokenizer.__call__`] for details.
-            [What are decoder input IDs?](../glossary#decoder-input-ids)
-            MBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
-            varies according to source and target language, *e.g.* 25004 for *en_XX*, and 25003 for *de_DE*. If
-            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
-            `past_key_values`).
-            For translation and summarization training, `decoder_input_ids` should be provided. If no
-            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
-            for denoising pre-training following the paper.
-        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
-            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
-            be used by default.
-        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
-            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-        decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
-            1]`:
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
-            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
-            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
-            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
-            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
-            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
-            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-            than the model's internal embedding lookup matrix.
-        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
-            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
-            representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
-            input (see `past_key_values`). This is useful if you want more control over how to convert
-            `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
-            If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
-            of `inputs_embeds`.
-        use_cache (`bool`, *optional*):
-            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
-            `past_key_values`).
-        output_attentions (`bool`, *optional*):
-            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-            tensors for more detail.
-        output_hidden_states (`bool`, *optional*):
-            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-            more detail.
-        return_dict (`bool`, *optional*):
-            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-"""
-class UnimerMBartEncoder(UnimerMBartPreTrainedModel):
-    """
-    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
-    [`MBartEncoderLayer`].
-    Args:
-        config: MBartConfig
-        embed_tokens (nn.Embedding): output embedding
-    """
-    def __init__(self, config: UnimerMBartConfig, embed_tokens: Optional[nn.Embedding] = None):
-        super().__init__(config)
-        self.dropout = config.dropout
-        self.layerdrop = config.encoder_layerdrop
-        embed_dim = config.d_model
-        self.padding_idx = config.pad_token_id
-        self.max_source_positions = config.max_position_embeddings
-        embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
-        self.embed_tokens = UnimerMBartScaledWordEmbedding(
-            config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
-        )
-        if embed_tokens is not None:
-            self.embed_tokens.weight = embed_tokens.weight
-        self.embed_positions = UnimerMBartLearnedPositionalEmbedding(
-            config.max_position_embeddings,
-            embed_dim,
-        )
-        self.layers = nn.ModuleList([UnimerMBartEncoderLayer(config) for _ in range(config.encoder_layers)])
-        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
-        self._use_sdpa = config._attn_implementation == "sdpa"
-        self.layernorm_embedding = nn.LayerNorm(embed_dim)
-        self.layer_norm = nn.LayerNorm(config.d_model)
-        self.gradient_checkpointing = False
-        # Initialize weights and apply final processing
-        self.post_init()
-    def _backward_compatibility_gradient_checkpointing(self):
-        # Override to not delete the attribute from the config
-        if self.supports_gradient_checkpointing and getattr(self.config, "gradient_checkpointing", False):
-            self.gradient_checkpointing_enable()
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutput]:
-        r"""
-        Args:
-            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
-                provide it.
-                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-                [`PreTrainedTokenizer.__call__`] for details.
-                [What are input IDs?](../glossary#input-ids)
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-                [What are attention masks?](../glossary#attention-mask)
-            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
-                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
-                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-                than the model's internal embedding lookup matrix.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more detail.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        """
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        # retrieve input_ids and inputs_embeds
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-        elif input_ids is not None:
-            input = input_ids
-            input_shape = input.shape
-            input_ids = input_ids.view(-1, input_shape[-1])
-        elif inputs_embeds is not None:
-            input = inputs_embeds[:, :, -1]
-        else:
-            raise ValueError("You have to specify either input_ids or inputs_embeds")
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-        embed_pos = self.embed_positions(input)
-        hidden_states = inputs_embeds + embed_pos.to(inputs_embeds.device)
-        hidden_states = self.layernorm_embedding(hidden_states)
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        # expand attention_mask
-        if attention_mask is not None:
-            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            if self._use_flash_attention_2:
-                attention_mask = attention_mask if 0 in attention_mask else None
-            elif self._use_sdpa and head_mask is None and not output_attentions:
-                # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
-                # the manual implementation that requires a 4D causal mask in all cases.
-                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
-            else:
-                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
-        encoder_states = () if output_hidden_states else None
-        all_attentions = () if output_attentions else None
-        # check if head_mask has a correct number of layers specified if desired
-        if head_mask is not None:
-            if head_mask.size()[0] != len(self.layers):
-                raise ValueError(
-                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
-                    f" {head_mask.size()[0]}."
-                )
-        for idx, encoder_layer in enumerate(self.layers):
-            if output_hidden_states:
-                encoder_states = encoder_states + (hidden_states,)
-            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            to_drop = False
-            if self.training:
-                dropout_probability = torch.rand([])
-                if dropout_probability < self.layerdrop:  # skip the layer
-                    to_drop = True
-            if to_drop:
-                layer_outputs = (None, None)
-            else:
-                if self.gradient_checkpointing and self.training:
-                    layer_outputs = self._gradient_checkpointing_func(
-                        encoder_layer.__call__,
-                        hidden_states,
-                        attention_mask,
-                        (head_mask[idx] if head_mask is not None else None),
-                        output_attentions,
-                    )
-                else:
-                    layer_outputs = encoder_layer(
-                        hidden_states,
-                        attention_mask,
-                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
-                        output_attentions=output_attentions,
-                    )
-                hidden_states = layer_outputs[0]
-            if output_attentions:
-                all_attentions = all_attentions + (layer_outputs[1],)
-        hidden_states = self.layer_norm(hidden_states)
-        if output_hidden_states:
-            encoder_states = encoder_states + (hidden_states,)
-        if not return_dict:
-            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
-        return BaseModelOutput(
-            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
-        )
-class UnimerMBartDecoder(UnimerMBartPreTrainedModel):
-    """
-    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MBartDecoderLayer`]
-    Args:
-        config: MBartConfig
-        embed_tokens (nn.Embedding): output embedding
-    """
-    def __init__(self, config: UnimerMBartConfig, embed_tokens: Optional[nn.Embedding] = None):
-        super().__init__(config)
-        self.dropout = config.dropout
-        self.layerdrop = config.decoder_layerdrop
-        self.padding_idx = config.pad_token_id
-        self.max_target_positions = config.max_position_embeddings
-        embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
-        self.embed_tokens = UnimerMBartScaledWordEmbedding(
-            config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
-        )
-        if embed_tokens is not None:
-            self.embed_tokens.weight = embed_tokens.weight
-        self.embed_positions = UnimerMBartLearnedPositionalEmbedding(
-            config.max_position_embeddings,
-            config.d_model,
-        )
-        self.layers = nn.ModuleList([UnimerMBartDecoderLayer(config) for _ in range(config.decoder_layers)])
-        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
-        self._use_sdpa = config._attn_implementation == "sdpa"
-        self.layernorm_embedding = nn.LayerNorm(config.d_model)
-        self.layer_norm = nn.LayerNorm(config.d_model)
-        self.gradient_checkpointing = False
-        # Initialize weights and apply final processing
-        self.post_init()
-    def get_input_embeddings(self):
-        return self.embed_tokens
-    def set_input_embeddings(self, value):
-        self.embed_tokens = value
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        count_pred: Optional[torch.FloatTensor] = None,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        encoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
-        r"""
-        Args:
-            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
-                provide it.
-                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-                [`PreTrainedTokenizer.__call__`] for details.
-                [What are input IDs?](../glossary#input-ids)
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-                [What are attention masks?](../glossary#attention-mask)
-            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
-                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
-                of the decoder.
-            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
-                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
-                selected in `[0, 1]`:
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-                [What are attention masks?](../glossary#attention-mask)
-            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
-                cross-attention on hidden heads. Mask values selected in `[0, 1]`:
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
-                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
-                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
-                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
-                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
-                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
-                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
-                than the model's internal embedding lookup matrix.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more detail.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        """
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        # retrieve input_ids and inputs_embeds
-        if input_ids is not None and inputs_embeds is not None:
-            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
-        elif input_ids is not None:
-            input = input_ids
-            input_shape = input.size()
-            input_ids = input_ids.view(-1, input_shape[-1])
-        elif inputs_embeds is not None:
-            input_shape = inputs_embeds.size()[:-1]
-            input = inputs_embeds[:, :, -1]
-        else:
-            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
-        # past_key_values_length
-        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
-        if inputs_embeds is None:
-            inputs_embeds = self.embed_tokens(input_ids)
-        if self._use_flash_attention_2:
-            # 2d mask is passed through the layers
-            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
-        elif self._use_sdpa and not output_attentions and cross_attn_head_mask is None:
-            # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
-            # the manual implementation that requires a 4D causal mask in all cases.
-            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-                attention_mask,
-                input_shape,
-                inputs_embeds,
-                past_key_values_length,
-            )
-        else:
-            # 4d mask is passed through the layers
-            attention_mask = _prepare_4d_causal_attention_mask(
-                attention_mask, input_shape, inputs_embeds, past_key_values_length
-            )
-        # expand encoder attention mask
-        if encoder_hidden_states is not None and encoder_attention_mask is not None:
-            if self._use_flash_attention_2:
-                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
-            elif self._use_sdpa and cross_attn_head_mask is None and not output_attentions:
-                # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
-                # the manual implementation that requires a 4D causal mask in all cases.
-                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
-                    encoder_attention_mask,
-                    inputs_embeds.dtype,
-                    tgt_len=input_shape[-1],
-                )
-            else:
-                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-                encoder_attention_mask = _prepare_4d_attention_mask(
-                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
-                )
-        # embed positions
-        positions = self.embed_positions(input, past_key_values_length)
-        hidden_states = inputs_embeds + positions.to(inputs_embeds.device)
-        # TODO: add counting context weight to hidden_states
-        if count_pred is not None:
-            count_context_weight = self.counting_context_weight(count_pred)
-            hidden_states = hidden_states + 0.5 * count_context_weight.unsqueeze(1)
-        hidden_states = self.layernorm_embedding(hidden_states)
-        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
-        if self.gradient_checkpointing and self.training:
-            if use_cache:
-                logger.warning_once(
-                    "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..."
-                )
-                use_cache = False
-        # decoder layers
-        all_hidden_states = () if output_hidden_states else None
-        all_self_attns = () if output_attentions else None
-        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
-        next_decoder_cache = () if use_cache else None
-        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
-        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
-            if attn_mask is not None:
-                if attn_mask.size()[0] != len(self.layers):
-                    raise ValueError(
-                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
-                        f" {attn_mask.size()[0]}."
-                    )
-        for idx, decoder_layer in enumerate(self.layers):
-            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
-            if output_hidden_states:
-                all_hidden_states += (hidden_states,)
-            if self.training:
-                dropout_probability = torch.rand([])
-                if dropout_probability < self.layerdrop:
-                    continue
-            past_key_value = past_key_values[idx] if past_key_values is not None else None
-            if self.gradient_checkpointing and self.training:
-                layer_outputs = self._gradient_checkpointing_func(
-                    decoder_layer.__call__,
-                    hidden_states,
-                    attention_mask,
-                    encoder_hidden_states,
-                    encoder_attention_mask,
-                    head_mask[idx] if head_mask is not None else None,
-                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
-                    None,
-                    output_attentions,
-                    use_cache,
-                )
-            else:
-                layer_outputs = decoder_layer(
-                    hidden_states,
-                    attention_mask=attention_mask,
-                    encoder_hidden_states=encoder_hidden_states,
-                    encoder_attention_mask=encoder_attention_mask,
-                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
-                    cross_attn_layer_head_mask=(
-                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
-                    ),
-                    past_key_value=past_key_value,
-                    output_attentions=output_attentions,
-                    use_cache=use_cache,
-                )
-            hidden_states = layer_outputs[0]
-            if use_cache:
-                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
-            if output_attentions:
-                all_self_attns += (layer_outputs[1],)
-                if encoder_hidden_states is not None:
-                    all_cross_attentions += (layer_outputs[2],)
-        hidden_states = self.layer_norm(hidden_states)
-        # add hidden states from the last decoder layer
-        if output_hidden_states:
-            all_hidden_states += (hidden_states,)
-        next_cache = next_decoder_cache if use_cache else None
-        if not return_dict:
-            return tuple(
-                v
-                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
-                if v is not None
-            )
-        return BaseModelOutputWithPastAndCrossAttentions(
-            last_hidden_state=hidden_states,
-            past_key_values=next_cache,
-            hidden_states=all_hidden_states,
-            attentions=all_self_attns,
-            cross_attentions=all_cross_attentions,
-        )
-@add_start_docstrings(
-    "The bare MBART Model outputting raw hidden-states without any specific head on top.",
-    MBART_START_DOCSTRING,
-)
-class UnimerMBartModel(UnimerMBartPreTrainedModel):
-    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
-    def __init__(self, config: UnimerMBartConfig):
-        super().__init__(config)
-        padding_idx, vocab_size = config.pad_token_id, config.vocab_size
-        self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
-        self.encoder = UnimerMBartEncoder(config, self.shared)
-        self.decoder = UnimerMBartDecoder(config, self.shared)
-        # Initialize weights and apply final processing
-        self.post_init()
-    def get_input_embeddings(self):
-        return self.shared
-    def set_input_embeddings(self, value):
-        self.shared = value
-        self.encoder.embed_tokens = self.shared
-        self.decoder.embed_tokens = self.shared
-    def get_encoder(self):
-        return self.encoder
-    def get_decoder(self):
-        return self.decoder
-    def _tie_weights(self):
-        if self.config.tie_word_embeddings:
-            self._tie_or_clone_weights(self.encoder.embed_tokens, self.get_input_embeddings())
-            self._tie_or_clone_weights(self.decoder.embed_tokens, self.get_input_embeddings())
-    @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=Seq2SeqModelOutput,
-        config_class=_CONFIG_FOR_DOC,
-        expected_output=_EXPECTED_OUTPUT_SHAPE,
-    )
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        decoder_input_ids: Optional[torch.LongTensor] = None,
-        decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Seq2SeqModelOutput, Tuple[torch.FloatTensor]]:
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        use_cache = use_cache if use_cache is not None else self.config.use_cache
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        # different to other models, MBart automatically creates decoder_input_ids from
-        # input_ids if no decoder_input_ids are provided
-        if decoder_input_ids is None and decoder_inputs_embeds is None:
-            decoder_input_ids = shift_tokens_right(input_ids, self.config.pad_token_id)
-        if encoder_outputs is None:
-            encoder_outputs = self.encoder(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                head_mask=head_mask,
-                inputs_embeds=inputs_embeds,
-                output_attentions=output_attentions,
-                output_hidden_states=output_hidden_states,
-                return_dict=return_dict,
-            )
-        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
-        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
-            encoder_outputs = BaseModelOutput(
-                last_hidden_state=encoder_outputs[0],
-                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
-                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
-            )
-        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
-        decoder_outputs = self.decoder(
-            input_ids=decoder_input_ids,
-            attention_mask=decoder_attention_mask,
-            encoder_hidden_states=encoder_outputs[0],
-            encoder_attention_mask=attention_mask,
-            head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
-            past_key_values=past_key_values,
-            inputs_embeds=decoder_inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        if not return_dict:
-            return decoder_outputs + encoder_outputs
-        return Seq2SeqModelOutput(
-            last_hidden_state=decoder_outputs.last_hidden_state,
-            past_key_values=decoder_outputs.past_key_values,
-            decoder_hidden_states=decoder_outputs.hidden_states,
-            decoder_attentions=decoder_outputs.attentions,
-            cross_attentions=decoder_outputs.cross_attentions,
-            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
-            encoder_hidden_states=encoder_outputs.hidden_states,
-            encoder_attentions=encoder_outputs.attentions,
-        )
-@add_start_docstrings(
-    "The MBART Model with a language modeling head. Can be used for summarization, after fine-tuning the pretrained models.",
-    MBART_START_DOCSTRING,
-)
-class UnimerMBartForConditionalGeneration(UnimerMBartPreTrainedModel, GenerationMixin):
-    base_model_prefix = "model"
-    _keys_to_ignore_on_load_missing = ["final_logits_bias"]
-    _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight", "lm_head.weight"]
-    def __init__(self, config: UnimerMBartConfig):
-        super().__init__(config)
-        self.model = UnimerMBartModel(config)
-        self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
-        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
-        # Initialize weights and apply final processing
-        self.post_init()
-    def get_encoder(self):
-        return self.model.get_encoder()
-    def get_decoder(self):
-        return self.model.get_decoder()
-    def resize_token_embeddings(self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None) -> nn.Embedding:
-        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
-        self._resize_final_logits_bias(new_embeddings.weight.shape[0])
-        return new_embeddings
-    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
-        old_num_tokens = self.final_logits_bias.shape[-1]
-        if new_num_tokens <= old_num_tokens:
-            new_bias = self.final_logits_bias[:, :new_num_tokens]
-        else:
-            extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
-            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
-        self.register_buffer("final_logits_bias", new_bias)
-    def get_output_embeddings(self):
-        return self.lm_head
-    def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
-    @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
-    @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
-    @add_end_docstrings(MBART_GENERATION_EXAMPLE)
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        decoder_input_ids: Optional[torch.LongTensor] = None,
-        decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
-        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-        Returns:
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        if labels is not None:
-            if use_cache:
-                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
-            use_cache = False
-            if decoder_input_ids is None and decoder_inputs_embeds is None:
-                decoder_input_ids = shift_tokens_right(labels, self.config.pad_token_id)
-        outputs = self.model(
-            input_ids,
-            attention_mask=attention_mask,
-            decoder_input_ids=decoder_input_ids,
-            encoder_outputs=encoder_outputs,
-            decoder_attention_mask=decoder_attention_mask,
-            head_mask=head_mask,
-            decoder_head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            decoder_inputs_embeds=decoder_inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
-        masked_lm_loss = None
-        if labels is not None:
-            loss_fct = CrossEntropyLoss()
-            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
-        if not return_dict:
-            output = (lm_logits,) + outputs[1:]
-            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
-        return Seq2SeqLMOutput(
-            loss=masked_lm_loss,
-            logits=lm_logits,
-            past_key_values=outputs.past_key_values,
-            decoder_hidden_states=outputs.decoder_hidden_states,
-            decoder_attentions=outputs.decoder_attentions,
-            cross_attentions=outputs.cross_attentions,
-            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
-            encoder_hidden_states=outputs.encoder_hidden_states,
-            encoder_attentions=outputs.encoder_attentions,
-        )
-    def prepare_inputs_for_generation(
-        self,
-        decoder_input_ids,
-        past_key_values=None,
-        attention_mask=None,
-        head_mask=None,
-        decoder_head_mask=None,
-        cross_attn_head_mask=None,
-        use_cache=None,
-        encoder_outputs=None,
-        **kwargs,
-    ):
-        # cut decoder_input_ids if past is used
-        if past_key_values is not None:
-            past_length = past_key_values[0][0].shape[2]
-            # Some generation methods already pass only the last input ID
-            if decoder_input_ids.shape[1] > past_length:
-                remove_prefix_length = past_length
-            else:
-                # Default to old behavior: keep only final ID
-                remove_prefix_length = decoder_input_ids.shape[1] - 1
-            decoder_input_ids = decoder_input_ids[:, remove_prefix_length:]
-        return {
-            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
-            "encoder_outputs": encoder_outputs,
-            "past_key_values": past_key_values,
-            "decoder_input_ids": decoder_input_ids,
-            "attention_mask": attention_mask,
-            "head_mask": head_mask,
-            "decoder_head_mask": decoder_head_mask,
-            "cross_attn_head_mask": cross_attn_head_mask,
-            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
-        }
-    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
-        return shift_tokens_right(labels, self.config.pad_token_id)
-    @staticmethod
-    def _reorder_cache(past_key_values, beam_idx):
-        reordered_past = ()
-        for layer_past in past_key_values:
-            # cached cross_attention states don't have to be reordered -> they are always the same
-            reordered_past += (
-                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
-                + layer_past[2:],
-            )
-        return reordered_past
-@add_start_docstrings(
-    """
-    MBart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
-    tasks.
-    """,
-    MBART_START_DOCSTRING,
-)
-class UnimerMBartForSequenceClassification(UnimerMBartPreTrainedModel):
-    _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"]
-    def __init__(self, config: UnimerMBartConfig, **kwargs):
-        super().__init__(config, **kwargs)
-        self.model = UnimerMBartModel(config)
-        self.classification_head = UnimerMBartClassificationHead(
-            config.d_model,
-            config.d_model,
-            config.num_labels,
-            config.classifier_dropout,
-        )
-        # Initialize weights and apply final processing
-        self.post_init()
-    @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=Seq2SeqSequenceClassifierOutput,
-        config_class=_CONFIG_FOR_DOC,
-    )
-    # Copied from transformers.models.bart.modeling_bart.BartForSequenceClassification.forward
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        decoder_input_ids: Optional[torch.LongTensor] = None,
-        decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, Seq2SeqSequenceClassifierOutput]:
-        r"""
-        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
-            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        if labels is not None:
-            use_cache = False
-        if input_ids is None and inputs_embeds is not None:
-            raise NotImplementedError(
-                f"Passing input embeddings is currently not supported for {self.__class__.__name__}"
-            )
-        outputs = self.model(
-            input_ids,
-            attention_mask=attention_mask,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            head_mask=head_mask,
-            decoder_head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
-            encoder_outputs=encoder_outputs,
-            inputs_embeds=inputs_embeds,
-            decoder_inputs_embeds=decoder_inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        hidden_states = outputs[0]  # last hidden state
-        eos_mask = input_ids.eq(self.config.eos_token_id).to(hidden_states.device)
-        if len(torch.unique_consecutive(eos_mask.sum(1))) > 1:
-            raise ValueError("All examples must have the same number of <eos> tokens.")
-        sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[
-            :, -1, :
-        ]
-        logits = self.classification_head(sentence_representation)
-        loss = None
-        if labels is not None:
-            labels = labels.to(logits.device)
-            if self.config.problem_type is None:
-                if self.config.num_labels == 1:
-                    self.config.problem_type = "regression"
-                elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
-                    self.config.problem_type = "single_label_classification"
-                else:
-                    self.config.problem_type = "multi_label_classification"
-            if self.config.problem_type == "regression":
-                loss_fct = MSELoss()
-                if self.config.num_labels == 1:
-                    loss = loss_fct(logits.squeeze(), labels.squeeze())
-                else:
-                    loss = loss_fct(logits, labels)
-            elif self.config.problem_type == "single_label_classification":
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
-            elif self.config.problem_type == "multi_label_classification":
-                loss_fct = BCEWithLogitsLoss()
-                loss = loss_fct(logits, labels)
-        if not return_dict:
-            output = (logits,) + outputs[1:]
-            return ((loss,) + output) if loss is not None else output
-        return Seq2SeqSequenceClassifierOutput(
-            loss=loss,
-            logits=logits,
-            past_key_values=outputs.past_key_values,
-            decoder_hidden_states=outputs.decoder_hidden_states,
-            decoder_attentions=outputs.decoder_attentions,
-            cross_attentions=outputs.cross_attentions,
-            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
-            encoder_hidden_states=outputs.encoder_hidden_states,
-            encoder_attentions=outputs.encoder_attentions,
-        )
-@add_start_docstrings(
-    """
-    MBART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
-    layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
-    """,
-    MBART_START_DOCSTRING,
-)
-class UnimerMBartForQuestionAnswering(UnimerMBartPreTrainedModel):
-    _tied_weights_keys = ["model.encoder.embed_tokens.weight", "model.decoder.embed_tokens.weight"]
-    def __init__(self, config):
-        super().__init__(config)
-        config.num_labels = 2
-        self.num_labels = config.num_labels
-        self.model = UnimerMBartModel(config)
-        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
-        # Initialize weights and apply final processing
-        self.post_init()
-    @add_start_docstrings_to_model_forward(MBART_INPUTS_DOCSTRING)
-    @add_code_sample_docstrings(
-        checkpoint=_CHECKPOINT_FOR_DOC,
-        output_type=Seq2SeqQuestionAnsweringModelOutput,
-        config_class=_CONFIG_FOR_DOC,
-    )
-    # Copied from transformers.models.bart.modeling_bart.BartForQuestionAnswering.forward
-    def forward(
-        self,
-        input_ids: torch.Tensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        decoder_input_ids: Optional[torch.LongTensor] = None,
-        decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        encoder_outputs: Optional[List[torch.FloatTensor]] = None,
-        start_positions: Optional[torch.LongTensor] = None,
-        end_positions: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, Seq2SeqQuestionAnsweringModelOutput]:
-        r"""
-        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for position (index) of the start of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
-            are not taken into account for computing the loss.
-        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
-            Labels for position (index) of the end of the labelled span for computing the token classification loss.
-            Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence
-            are not taken into account for computing the loss.
-        """
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        if start_positions is not None and end_positions is not None:
-            use_cache = False
-        outputs = self.model(
-            input_ids,
-            attention_mask=attention_mask,
-            decoder_input_ids=decoder_input_ids,
-            decoder_attention_mask=decoder_attention_mask,
-            head_mask=head_mask,
-            decoder_head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
-            encoder_outputs=encoder_outputs,
-            inputs_embeds=inputs_embeds,
-            decoder_inputs_embeds=decoder_inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        sequence_output = outputs[0]
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = logits.split(1, dim=-1)
-        start_logits = start_logits.squeeze(-1).contiguous()
-        end_logits = end_logits.squeeze(-1).contiguous()
-        total_loss = None
-        if start_positions is not None and end_positions is not None:
-            # If we are on multi-GPU, split add a dimension
-            if len(start_positions.size()) > 1:
-                start_positions = start_positions.squeeze(-1)
-            if len(end_positions.size()) > 1:
-                end_positions = end_positions.squeeze(-1)
-            # sometimes the start/end positions are outside our model inputs, we ignore these terms
-            ignored_index = start_logits.size(1)
-            start_positions = start_positions.clamp(0, ignored_index)
-            end_positions = end_positions.clamp(0, ignored_index)
-            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
-            start_loss = loss_fct(start_logits, start_positions)
-            end_loss = loss_fct(end_logits, end_positions)
-            total_loss = (start_loss + end_loss) / 2
-        if not return_dict:
-            output = (
-                start_logits,
-                end_logits,
-            ) + outputs[1:]
-            return ((total_loss,) + output) if total_loss is not None else output
-        return Seq2SeqQuestionAnsweringModelOutput(
-            loss=total_loss,
-            start_logits=start_logits,
-            end_logits=end_logits,
-            past_key_values=outputs.past_key_values,
-            decoder_hidden_states=outputs.decoder_hidden_states,
-            decoder_attentions=outputs.decoder_attentions,
-            cross_attentions=outputs.cross_attentions,
-            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
-            encoder_hidden_states=outputs.encoder_hidden_states,
-            encoder_attentions=outputs.encoder_attentions,
-        )
-# Copied from transformers.models.bart.modeling_bart.BartDecoderWrapper with Bart->MBart
-class UnimerMBartDecoderWrapper(UnimerMBartPreTrainedModel):
-    """
-    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
-    used in combination with the [`EncoderDecoderModel`] framework.
-    """
-    def __init__(self, config):
-        super().__init__(config)
-        self.decoder = UnimerMBartDecoder(config)
-    def forward(self, *args, **kwargs):
-        return self.decoder(*args, **kwargs)
-# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->MBart, facebook/bart-base->facebook/mbart-large-cc25
-class UnimerMBartForCausalLM(UnimerMBartPreTrainedModel, GenerationMixin):
-    _tied_weights_keys = ["lm_head.weight"]
-    def __init__(self, config):
-        config = copy.deepcopy(config)
-        config.is_decoder = True
-        config.is_encoder_decoder = False
-        super().__init__(config)
-        self.model = UnimerMBartDecoderWrapper(config)
-        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        # Initialize weights and apply final processing
-        self.post_init()
-    def get_input_embeddings(self):
-        return self.model.decoder.embed_tokens
-    def set_input_embeddings(self, value):
-        self.model.decoder.embed_tokens = value
-    def get_output_embeddings(self):
-        return self.lm_head
-    def set_output_embeddings(self, new_embeddings):
-        self.lm_head = new_embeddings
-    def set_decoder(self, decoder):
-        self.model.decoder = decoder
-    def get_decoder(self):
-        return self.model.decoder
-    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentionsAndCounting, config_class=_CONFIG_FOR_DOC)
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        encoder_attention_mask: Optional[torch.FloatTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        count_gt: Optional[torch.LongTensor] = None,
-    ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]:
-        r"""
-        Args:
-            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
-                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
-                provide it.
-                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
-                [`PreTrainedTokenizer.__call__`] for details.
-                [What are input IDs?](../glossary#input-ids)
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-                [What are attention masks?](../glossary#attention-mask)
-            encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
-                if the model is configured as a decoder.
-            encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
-                in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
-            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-                Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
-                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
-                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
-                tensors are only required when the model is used as a decoder in a Sequence to Sequence model.
-                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
-                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
-                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
-                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
-            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-            use_cache (`bool`, *optional*):
-                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
-                (see `past_key_values`).
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-            output_attentions (`bool`, *optional*):
-                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
-                returned tensors for more detail.
-            output_hidden_states (`bool`, *optional*):
-                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
-                for more detail.
-            return_dict (`bool`, *optional*):
-                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
-        Returns:
-        Example:
-        ```python
-        >>> from transformers import AutoTokenizer, MBartForCausalLM
-        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-cc25")
-        >>> model = MBartForCausalLM.from_pretrained("facebook/mbart-large-cc25", add_cross_attention=False)
-        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
-        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
-        >>> outputs = model(**inputs)
-        >>> logits = outputs.logits
-        >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
-        >>> list(logits.shape) == expected_shape
-        True
-        ```"""
-        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-        output_hidden_states = (
-            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-        )
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-        count_pred = None
-        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-        outputs = self.model.decoder(
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            count_pred=count_pred,
-            encoder_hidden_states=encoder_hidden_states,
-            encoder_attention_mask=encoder_attention_mask,
-            head_mask=head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-        logits = self.lm_head(outputs[0])
-        loss = None
-        if labels is not None:
-            labels = labels.to(logits.device)
-            loss_fct = CrossEntropyLoss()
-            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
-        if not return_dict:
-            output = (logits,) + outputs[1:]
-            return (loss,) + output if loss is not None else output
-        return CausalLMOutputWithCrossAttentionsAndCounting(
-            loss=loss,
-            logits=logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-            cross_attentions=outputs.cross_attentions,
-            counting=count_pred,
-        )
-    def prepare_inputs_for_generation(
-        self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
-    ):
-        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
-        if attention_mask is None:
-            attention_mask = input_ids.new_ones(input_ids.shape)
-        if past_key_values:
-            past_length = past_key_values[0][0].shape[2]
-            # Some generation methods already pass only the last input ID
-            if input_ids.shape[1] > past_length:
-                remove_prefix_length = past_length
-            else:
-                # Default to old behavior: keep only final ID
-                remove_prefix_length = input_ids.shape[1] - 1
-            input_ids = input_ids[:, remove_prefix_length:]
-        # first step, decoder_cached_states are empty
-        return {
-            "input_ids": input_ids,  # encoder_outputs is defined. input_ids not needed
-            "attention_mask": attention_mask,
-            "past_key_values": past_key_values,
-            "use_cache": use_cache,
-        }
-    @staticmethod
-    def _reorder_cache(past_key_values, beam_idx):
-        reordered_past = ()
-        for layer_past in past_key_values:
-            reordered_past += (
-                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
-            )
-        return reordered_past
--- a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py
+++ b/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_mbart/tokenization_unimer_mbart.py
--- a/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py
+++ b/magic_pdf/model/sub_modules/mfr/unimernet/unimernet_hf/unimer_swin/__init__.py
-from .configuration_unimer_swin import UnimerSwinConfig
-from .modeling_unimer_swin import UnimerSwinModel
-from .image_processing_unimer_swin import UnimerSwinImageProcessor
-__all__ = [
-    "UnimerSwinConfig",
-    "UnimerSwinModel",
-    "UnimerSwinImageProcessor",
-]