Unverified Commit 01f0fd0b authored by Sylvain Gugger, committed by GitHub

Fixes for LayoutLM (#7318)

parent 702a76ff
@@ -40,40 +40,40 @@ class LayoutLMConfig(BertConfig):
     Args:
-        vocab_size (:obj:`int`, optional, defaults to 30522):
+        vocab_size (:obj:`int`, `optional`, defaults to 30522):
             Vocabulary size of the LayoutLM model. Defines the different tokens that
             can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.LayoutLMModel`.
-        hidden_size (:obj:`int`, optional, defaults to 768):
+        hidden_size (:obj:`int`, `optional`, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, optional, defaults to 12):
+        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, optional, defaults to 12):
+        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, optional, defaults to 3072):
+        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler.
-            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+            If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
             The maximum sequence length that this model might ever be used with.
             Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, optional, defaults to 2):
+        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
+            The vocabulary size of the :obj:`token_type_ids` passed into :class:`~transformers.LayoutLMModel`.
-        initializer_range (:obj:`float`, optional, defaults to 0.02):
+        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        gradient_checkpointing (:obj:`bool`, optional, defaults to :obj:`False`):
+        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
             If True, use gradient checkpointing to save memory at the expense of slower backward pass.
-        max_2d_position_embeddings (:obj:`int`, optional, defaults to 1024):
+        max_2d_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
             The maximum value that the 2D position embedding might ever used.
             Typically set this to something large just in case (e.g., 1024).
-    Example::
+    Examples::
         >>> from transformers import LayoutLMModel, LayoutLMConfig
...
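For reference (a minimal usage sketch, not part of the diff), this is how the configuration documented above is typically used. It relies only on the public LayoutLMConfig and LayoutLMModel classes named in the docstring, with the documented defaults spelled out explicitly for illustration::

    from transformers import LayoutLMConfig, LayoutLMModel

    # Build a configuration with the documented default hyperparameters.
    configuration = LayoutLMConfig(
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        max_position_embeddings=512,
        type_vocab_size=2,
        max_2d_position_embeddings=1024,
    )

    # Initialize a model (random weights) from that configuration.
    model = LayoutLMModel(configuration)

    # The configuration is kept on the model and can be read back.
    configuration = model.config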
@@ -118,6 +118,7 @@ class LayoutLMEmbeddings(nn.Module):
         return embeddings

+# Copied from transformers.modeling_bert.BertSelfAttention with Bert->LayoutLM
 class LayoutLMSelfAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -172,6 +173,7 @@ class LayoutLMSelfAttention(nn.Module):
         attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
         attention_scores = attention_scores / math.sqrt(self.attention_head_size)
         if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in LayoutLMModel forward() function)
             attention_scores = attention_scores + attention_mask

         # Normalize the attention scores to probabilities.
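As an aside (an illustrative sketch, not part of the diff), the additive mask convention referenced by the comment above works roughly like this: the model's forward() precomputes an extended mask with 0.0 at positions to attend to and a large negative value (BERT-style code conventionally uses -10000.0) at padding positions, so adding it to the raw scores before the softmax drives the masked probabilities toward zero. The shapes and constants below are assumptions chosen for illustration::

    import torch

    # Hypothetical batch of 2 sequences of length 4; 1 = real token, 0 = padding.
    attention_mask = torch.tensor([[1, 1, 1, 0],
                                   [1, 1, 0, 0]])

    # Precomputed once per forward pass: broadcastable to
    # (batch, num_heads, seq_len, seq_len); 0.0 to keep, -10000.0 to mask.
    extended_mask = (1.0 - attention_mask[:, None, None, :].float()) * -10000.0

    attention_scores = torch.randn(2, 1, 4, 4)            # stand-in for QK^T / sqrt(d_head)
    attention_scores = attention_scores + extended_mask   # masked columns become ~-10000
    attention_probs = torch.softmax(attention_scores, dim=-1)  # ~0 weight on padding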
@@ -195,6 +197,7 @@ class LayoutLMSelfAttention(nn.Module):
         return outputs

+# Copied from transformers.modeling_bert.BertSelfOutput with Bert->LayoutLM
 class LayoutLMSelfOutput(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -209,6 +212,7 @@ class LayoutLMSelfOutput(nn.Module):
         return hidden_states

+# Copied from transformers.modeling_bert.BertAttention with Bert->LayoutLM
 class LayoutLMAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -256,6 +260,7 @@ class LayoutLMAttention(nn.Module):
         return outputs

+# Copied from transformers.modeling_bert.BertIntermediate
 class LayoutLMIntermediate(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -271,6 +276,7 @@ class LayoutLMIntermediate(nn.Module):
         return hidden_states

+# Copied from transformers.modeling_bert.BertOutput with Bert->LayoutLM
 class LayoutLMOutput(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -285,6 +291,7 @@ class LayoutLMOutput(nn.Module):
         return hidden_states

+# Copied from transformers.modeling_bert.BertLayer with Bert->LayoutLM
 class LayoutLMLayer(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -344,6 +351,7 @@ class LayoutLMLayer(nn.Module):
         return layer_output

+# Copied from transformers.modeling_bert.BertEncoder with Bert->LayoutLM
 class LayoutLMEncoder(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -408,6 +416,7 @@ class LayoutLMEncoder(nn.Module):
         )

+# Copied from transformers.modeling_bert.BertPooler
 class LayoutLMPooler(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -423,6 +432,7 @@ class LayoutLMPooler(nn.Module):
         return pooled_output

+# Copied from transformers.modeling_bert.BertPredictionHeadTransform with Bert->LayoutLM
 class LayoutLMPredictionHeadTransform(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -440,6 +450,7 @@ class LayoutLMPredictionHeadTransform(nn.Module):
         return hidden_states

+# Copied from transformers.modeling_bert.BertLMPredictionHead with Bert->LayoutLM
 class LayoutLMLMPredictionHead(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -460,6 +471,7 @@ class LayoutLMLMPredictionHead(nn.Module):
         return hidden_states

+# Copied from transformers.modeling_bert.BertOnlyMLMHead with Bert->LayoutLM
 class LayoutLMOnlyMLMHead(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -470,28 +482,6 @@ class LayoutLMOnlyMLMHead(nn.Module):
         return prediction_scores
-class LayoutLMOnlyNSPHead(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.seq_relationship = nn.Linear(config.hidden_size, 2)
-
-    def forward(self, pooled_output):
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return seq_relationship_score
-
-
-class LayoutLMPreTrainingHeads(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.predictions = LayoutLMLMPredictionHead(config)
-        self.seq_relationship = nn.Linear(config.hidden_size, 2)
-
-    def forward(self, sequence_output, pooled_output):
-        prediction_scores = self.predictions(sequence_output)
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return prediction_scores, seq_relationship_score
 class LayoutLMPreTrainedModel(PreTrainedModel):
     """An abstract class to handle weights initialization and
     a simple interface for downloading and loading pretrained models.
...
@@ -142,7 +142,7 @@ class RobertaEmbeddings(nn.Module):
         return position_ids.unsqueeze(0).expand(input_shape)

-# Copied from transformers.modeling_bert.BertSelfAttention
+# Copied from transformers.modeling_bert.BertSelfAttention with Bert->Roberta
 class RobertaSelfAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -197,7 +197,7 @@ class RobertaSelfAttention(nn.Module):
         attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
         attention_scores = attention_scores / math.sqrt(self.attention_head_size)
         if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+            # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function)
             attention_scores = attention_scores + attention_mask

         # Normalize the attention scores to probabilities.
...