Unverified Commit 01f0fd0b authored by Sylvain Gugger, committed by GitHub

Fixes for LayoutLM (#7318)

parent 702a76ff
@@ -40,40 +40,40 @@ class LayoutLMConfig(BertConfig):
     Args:
-        vocab_size (:obj:`int`, optional, defaults to 30522):
+        vocab_size (:obj:`int`, `optional`, defaults to 30522):
             Vocabulary size of the LayoutLM model. Defines the different tokens that
             can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.LayoutLMModel`.
-        hidden_size (:obj:`int`, optional, defaults to 768):
+        hidden_size (:obj:`int`, `optional`, defaults to 768):
             Dimensionality of the encoder layers and the pooler layer.
-        num_hidden_layers (:obj:`int`, optional, defaults to 12):
+        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
             Number of hidden layers in the Transformer encoder.
-        num_attention_heads (:obj:`int`, optional, defaults to 12):
+        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
-        intermediate_size (:obj:`int`, optional, defaults to 3072):
+        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
-        hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
+        hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler.
-            If string, "gelu", "relu", "swish" and "gelu_new" are supported.
+            If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
-        hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
-        attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
+        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
             The dropout ratio for the attention probabilities.
-        max_position_embeddings (:obj:`int`, optional, defaults to 512):
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
             The maximum sequence length that this model might ever be used with.
             Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (:obj:`int`, optional, defaults to 2):
+        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
-            The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
+            The vocabulary size of the :obj:`token_type_ids` passed into :class:`~transformers.LayoutLMModel`.
-        initializer_range (:obj:`float`, optional, defaults to 0.02):
+        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
+        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
-        gradient_checkpointing (:obj:`bool`, optional, defaults to :obj:`False`):
+        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
             If True, use gradient checkpointing to save memory at the expense of slower backward pass.
-        max_2d_position_embeddings (:obj:`int`, optional, defaults to 1024):
+        max_2d_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
             The maximum value that the 2D position embedding might ever used.
             Typically set this to something large just in case (e.g., 1024).
-    Example::
+    Examples::
         >>> from transformers import LayoutLMModel, LayoutLMConfig
...
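For reference (a minimal usage sketch, not part of the diff), this is how the configuration documented above is typically used. It relies only on the public LayoutLMConfig and LayoutLMModel classes named in the docstring, with the documented defaults spelled out explicitly for illustration::

    from transformers import LayoutLMConfig, LayoutLMModel

    # Build a configuration with the documented default hyperparameters.
    configuration = LayoutLMConfig(
        vocab_size=30522,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        max_position_embeddings=512,
        type_vocab_size=2,
        max_2d_position_embeddings=1024,
    )

    # Initialize a model (random weights) from that configuration.
    model = LayoutLMModel(configuration)

    # The configuration is kept on the model and can be read back.
    configuration = model.config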
@@ -118,6 +118,7 @@ class LayoutLMEmbeddings(nn.Module):
         return embeddings

+# Copied from transformers.modeling_bert.BertSelfAttention with Bert->LayoutLM
 class LayoutLMSelfAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -172,6 +173,7 @@ class LayoutLMSelfAttention(nn.Module):
         attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
         attention_scores = attention_scores / math.sqrt(self.attention_head_size)
         if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in LayoutLMModel forward() function)
             attention_scores = attention_scores + attention_mask

         # Normalize the attention scores to probabilities.
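As an aside (an illustrative sketch, not part of the diff), the additive mask convention referenced by the comment above works roughly like this: the model's forward() precomputes an extended mask with 0.0 at positions to attend to and a large negative value (BERT-style code conventionally uses -10000.0) at padding positions, so adding it to the raw scores before the softmax drives the masked probabilities toward zero. The shapes and constants below are assumptions chosen for illustration::

    import torch

    # Hypothetical batch of 2 sequences of length 4; 1 = real token, 0 = padding.
    attention_mask = torch.tensor([[1, 1, 1, 0],
                                   [1, 1, 0, 0]])

    # Precomputed once per forward pass: broadcastable to
    # (batch, num_heads, seq_len, seq_len); 0.0 to keep, -10000.0 to mask.
    extended_mask = (1.0 - attention_mask[:, None, None, :].float()) * -10000.0

    attention_scores = torch.randn(2, 1, 4, 4)            # stand-in for QK^T / sqrt(d_head)
    attention_scores = attention_scores + extended_mask   # masked columns become ~-10000
    attention_probs = torch.softmax(attention_scores, dim=-1)  # ~0 weight on padding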
@@ -195,6 +197,7 @@ class LayoutLMSelfAttention(nn.Module):
         return outputs

+# Copied from transformers.modeling_bert.BertSelfOutput with Bert->LayoutLM
 class LayoutLMSelfOutput(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -209,6 +212,7 @@ class LayoutLMSelfOutput(nn.Module):
         return hidden_states

+# Copied from transformers.modeling_bert.BertAttention with Bert->LayoutLM
 class LayoutLMAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -256,6 +260,7 @@ class LayoutLMAttention(nn.Module):
         return outputs

+# Copied from transformers.modeling_bert.BertIntermediate
 class LayoutLMIntermediate(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -271,6 +276,7 @@ class LayoutLMIntermediate(nn.Module):
         return hidden_states

+# Copied from transformers.modeling_bert.BertOutput with Bert->LayoutLM
 class LayoutLMOutput(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -285,6 +291,7 @@ class LayoutLMOutput(nn.Module):
         return hidden_states

+# Copied from transformers.modeling_bert.BertLayer with Bert->LayoutLM
 class LayoutLMLayer(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -344,6 +351,7 @@ class LayoutLMLayer(nn.Module):
         return layer_output

+# Copied from transformers.modeling_bert.BertEncoder with Bert->LayoutLM
 class LayoutLMEncoder(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -408,6 +416,7 @@ class LayoutLMEncoder(nn.Module):
         )

+# Copied from transformers.modeling_bert.BertPooler
 class LayoutLMPooler(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -423,6 +432,7 @@ class LayoutLMPooler(nn.Module):
         return pooled_output

+# Copied from transformers.modeling_bert.BertPredictionHeadTransform with Bert->LayoutLM
 class LayoutLMPredictionHeadTransform(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -440,6 +450,7 @@ class LayoutLMPredictionHeadTransform(nn.Module):
         return hidden_states

+# Copied from transformers.modeling_bert.BertLMPredictionHead with Bert->LayoutLM
 class LayoutLMLMPredictionHead(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -460,6 +471,7 @@ class LayoutLMLMPredictionHead(nn.Module):
         return hidden_states

+# Copied from transformers.modeling_bert.BertOnlyMLMHead with Bert->LayoutLM
 class LayoutLMOnlyMLMHead(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -470,28 +482,6 @@ class LayoutLMOnlyMLMHead(nn.Module):
         return prediction_scores
-class LayoutLMOnlyNSPHead(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.seq_relationship = nn.Linear(config.hidden_size, 2)
-
-    def forward(self, pooled_output):
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return seq_relationship_score
-
-
-class LayoutLMPreTrainingHeads(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.predictions = LayoutLMLMPredictionHead(config)
-        self.seq_relationship = nn.Linear(config.hidden_size, 2)
-
-    def forward(self, sequence_output, pooled_output):
-        prediction_scores = self.predictions(sequence_output)
-        seq_relationship_score = self.seq_relationship(pooled_output)
-        return prediction_scores, seq_relationship_score
 class LayoutLMPreTrainedModel(PreTrainedModel):
     """An abstract class to handle weights initialization and
     a simple interface for downloading and loading pretrained models.
...
@@ -142,7 +142,7 @@ class RobertaEmbeddings(nn.Module):
         return position_ids.unsqueeze(0).expand(input_shape)

-# Copied from transformers.modeling_bert.BertSelfAttention
+# Copied from transformers.modeling_bert.BertSelfAttention with Bert->Roberta
 class RobertaSelfAttention(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -197,7 +197,7 @@ class RobertaSelfAttention(nn.Module):
         attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
         attention_scores = attention_scores / math.sqrt(self.attention_head_size)
         if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+            # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function)
             attention_scores = attention_scores + attention_mask

         # Normalize the attention scores to probabilities.
...