"tests/vscode:/vscode.git/clone" did not exist on "7ec9128e5a77980b17d5158241c4a182536d5668"
Unverified Commit 408b2d2b authored by NielsRogge's avatar NielsRogge Committed by GitHub
Browse files

Add TrOCR + VisionEncoderDecoderModel (#13874)

* First draft

* Update self-attention of RoBERTa as proposition

* Improve conversion script

* Add TrOCR decoder-only model

* More improvements

* Make forward pass with pretrained weights work

* More improvements

* Some more improvements

* More improvements

* Make conversion work

* Clean up print statements

* Add documentation, processor

* Add test files

* Small improvements

* Some more improvements

* Make fix-copies, improve docs

* Make all vision encoder decoder model tests pass

* Make conversion script support other models

* Update URL for OCR image

* Update conversion script

* Fix style & quality

* Add support for the large-printed model

* Fix some issues

* Add print statement for debugging

* Add print statements for debugging

* Make possible fix for sinusoidal embedding

* Further debugging

* Potential fix v2

* Add more print statements for debugging

* Add more print statements for debugging

* Deubg more

* Comment out print statements

* Make conversion of large printed model possible, address review comments

* Make it possible to convert the stage1 checkpoints

* Clean up code, apply suggestions from code review

* Apply suggestions from code review, use Microsoft models in tests

* Rename encoder_hidden_size to cross_attention_hidden_size

* Improve docs
parent 61f64262
...@@ -63,6 +63,8 @@ class ViTConfig(PretrainedConfig): ...@@ -63,6 +63,8 @@ class ViTConfig(PretrainedConfig):
The size (resolution) of each patch. The size (resolution) of each patch.
num_channels (:obj:`int`, `optional`, defaults to :obj:`3`): num_channels (:obj:`int`, `optional`, defaults to :obj:`3`):
The number of input channels. The number of input channels.
qkv_bias (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to add a bias to the queries, keys and values.
Example:: Example::
...@@ -95,6 +97,7 @@ class ViTConfig(PretrainedConfig): ...@@ -95,6 +97,7 @@ class ViTConfig(PretrainedConfig):
image_size=224, image_size=224,
patch_size=16, patch_size=16,
num_channels=3, num_channels=3,
qkv_bias=True,
**kwargs **kwargs
): ):
super().__init__(**kwargs) super().__init__(**kwargs)
...@@ -112,3 +115,4 @@ class ViTConfig(PretrainedConfig): ...@@ -112,3 +115,4 @@ class ViTConfig(PretrainedConfig):
self.image_size = image_size self.image_size = image_size
self.patch_size = patch_size self.patch_size = patch_size
self.num_channels = num_channels self.num_channels = num_channels
self.qkv_bias = qkv_bias
...@@ -139,16 +139,19 @@ class FlaxViTSelfAttention(nn.Module): ...@@ -139,16 +139,19 @@ class FlaxViTSelfAttention(nn.Module):
self.config.hidden_size, self.config.hidden_size,
dtype=self.dtype, dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range, self.dtype), kernel_init=jax.nn.initializers.normal(self.config.initializer_range, self.dtype),
use_bias=self.config.qkv_bias,
) )
self.key = nn.Dense( self.key = nn.Dense(
self.config.hidden_size, self.config.hidden_size,
dtype=self.dtype, dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range, self.dtype), kernel_init=jax.nn.initializers.normal(self.config.initializer_range, self.dtype),
use_bias=self.config.qkv_bias,
) )
self.value = nn.Dense( self.value = nn.Dense(
self.config.hidden_size, self.config.hidden_size,
dtype=self.dtype, dtype=self.dtype,
kernel_init=jax.nn.initializers.normal(self.config.initializer_range, self.dtype), kernel_init=jax.nn.initializers.normal(self.config.initializer_range, self.dtype),
use_bias=self.config.qkv_bias,
) )
def __call__(self, hidden_states, deterministic: bool = True, output_attentions: bool = False): def __call__(self, hidden_states, deterministic: bool = True, output_attentions: bool = False):
......
...@@ -169,9 +169,9 @@ class ViTSelfAttention(nn.Module): ...@@ -169,9 +169,9 @@ class ViTSelfAttention(nn.Module):
self.attention_head_size = int(config.hidden_size / config.num_attention_heads) self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query = nn.Linear(config.hidden_size, self.all_head_size) self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.key = nn.Linear(config.hidden_size, self.all_head_size) self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.value = nn.Linear(config.hidden_size, self.all_head_size) self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
self.dropout = nn.Dropout(config.attention_probs_dropout_prob) self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
...@@ -505,6 +505,7 @@ class ViTModel(ViTPreTrainedModel): ...@@ -505,6 +505,7 @@ class ViTModel(ViTPreTrainedModel):
def forward( def forward(
self, self,
pixel_values=None, pixel_values=None,
attention_mask=None,
head_mask=None, head_mask=None,
output_attentions=None, output_attentions=None,
output_hidden_states=None, output_hidden_states=None,
......
...@@ -3587,6 +3587,36 @@ def load_tf_weights_in_transfo_xl(*args, **kwargs): ...@@ -3587,6 +3587,36 @@ def load_tf_weights_in_transfo_xl(*args, **kwargs):
requires_backends(load_tf_weights_in_transfo_xl, ["torch"]) requires_backends(load_tf_weights_in_transfo_xl, ["torch"])
TROCR_PRETRAINED_MODEL_ARCHIVE_LIST = None
class TrOCRForCausalLM:
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class TrOCRPreTrainedModel:
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class VisionEncoderDecoderModel:
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
VISUAL_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = None VISUAL_BERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
......
# coding=utf-8
# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Testing suite for the PyTorch TrOCR model. """
import unittest
from transformers import TrOCRConfig
from transformers.testing_utils import is_torch_available, require_torch, torch_device
from .test_configuration_common import ConfigTester
from .test_generation_utils import GenerationTesterMixin
from .test_modeling_common import ModelTesterMixin, ids_tensor
if is_torch_available():
import torch
from transformers.models.trocr.modeling_trocr import TrOCRDecoder, TrOCRForCausalLM
@require_torch
class TrOCRStandaloneDecoderModelTester:
def __init__(
self,
parent,
vocab_size=99,
batch_size=13,
d_model=16,
decoder_seq_length=7,
is_training=True,
is_decoder=True,
use_attention_mask=True,
use_cache=False,
use_labels=True,
decoder_start_token_id=2,
decoder_ffn_dim=32,
decoder_layers=4,
decoder_attention_heads=4,
max_position_embeddings=30,
pad_token_id=0,
bos_token_id=1,
eos_token_id=2,
scope=None,
):
self.parent = parent
self.batch_size = batch_size
self.decoder_seq_length = decoder_seq_length
# For common tests
self.seq_length = self.decoder_seq_length
self.is_training = is_training
self.use_attention_mask = use_attention_mask
self.use_labels = use_labels
self.vocab_size = vocab_size
self.d_model = d_model
self.hidden_size = d_model
self.num_hidden_layers = decoder_layers
self.decoder_layers = decoder_layers
self.decoder_ffn_dim = decoder_ffn_dim
self.decoder_attention_heads = decoder_attention_heads
self.num_attention_heads = decoder_attention_heads
self.eos_token_id = eos_token_id
self.bos_token_id = bos_token_id
self.pad_token_id = pad_token_id
self.decoder_start_token_id = decoder_start_token_id
self.use_cache = use_cache
self.max_position_embeddings = max_position_embeddings
self.scope = None
self.decoder_key_length = decoder_seq_length
self.base_model_out_len = 2
self.decoder_attention_idx = 1
def prepare_config_and_inputs(self):
input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
attention_mask = None
if self.use_attention_mask:
attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
lm_labels = None
if self.use_labels:
lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
config = TrOCRConfig(
vocab_size=self.vocab_size,
d_model=self.d_model,
decoder_layers=self.decoder_layers,
decoder_ffn_dim=self.decoder_ffn_dim,
decoder_attention_heads=self.decoder_attention_heads,
eos_token_id=self.eos_token_id,
bos_token_id=self.bos_token_id,
use_cache=self.use_cache,
pad_token_id=self.pad_token_id,
decoder_start_token_id=self.decoder_start_token_id,
max_position_embeddings=self.max_position_embeddings,
)
return (config, input_ids, attention_mask, lm_labels)
def create_and_check_decoder_model_past(
self,
config,
input_ids,
attention_mask,
lm_labels,
):
config.use_cache = True
model = TrOCRDecoder(config=config).to(torch_device).eval()
input_ids = input_ids[:2]
input_ids[input_ids == 0] += 1
# first forward pass
outputs = model(input_ids, use_cache=True)
outputs_use_cache_conf = model(input_ids)
outputs_no_past = model(input_ids, use_cache=False)
self.parent.assertTrue(len(outputs) == len(outputs_use_cache_conf))
self.parent.assertTrue(len(outputs) == len(outputs_no_past) + 1)
past_key_values = outputs["past_key_values"]
# create hypothetical next token and extent to next_input_ids
next_tokens = ids_tensor((2, 1), config.vocab_size - 1) + 1
# append to next input_ids and
next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
output_from_no_past = model(next_input_ids)["last_hidden_state"]
output_from_past = model(next_tokens, past_key_values=past_key_values)["last_hidden_state"]
# select random slice
random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
output_from_no_past_slice = output_from_no_past[:, next_input_ids.shape[-1] - 1, random_slice_idx].detach()
output_from_past_slice = output_from_past[:, 0, random_slice_idx].detach()
# test that outputs are equal for slice
assert torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)
def prepare_config_and_inputs_for_common(self):
config_and_inputs = self.prepare_config_and_inputs()
config, input_ids, attention_mask, lm_labels = config_and_inputs
inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
return config, inputs_dict
@require_torch
class TrOCRStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (TrOCRDecoder, TrOCRForCausalLM) if is_torch_available() else ()
all_generative_model_classes = (TrOCRForCausalLM,) if is_torch_available() else ()
test_pruning = False
def setUp(self):
self.model_tester = TrOCRStandaloneDecoderModelTester(self, is_training=False)
self.config_tester = ConfigTester(self, config_class=TrOCRConfig)
# not implemented currently
def test_inputs_embeds(self):
pass
# trocr has no base model
def test_save_load_fast_init_from_base(self):
pass
# trocr has no base model
def test_save_load_fast_init_to_base(self):
pass
def test_config(self):
self.config_tester.run_common_tests()
def test_decoder_model_past(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_decoder_model_past(*config_and_inputs)
# decoder cannot keep gradients
def test_retain_grad_hidden_states_attentions(self):
return
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment