Unverified Commit 01f0fd0b authored by Sylvain Gugger, committed by GitHub

Fixes for LayoutLM (#7318)

parent 702a76ff
@@ -40,40 +40,40 @@ class LayoutLMConfig(BertConfig):
Args:
vocab_size (:obj:`int`, optional, defaults to 30522):
vocab_size (:obj:`int`, `optional`, defaults to 30522):
Vocabulary size of the LayoutLM model. Defines the number of different tokens that
can be represented by the `input_ids` passed to the forward method of :class:`~transformers.LayoutLMModel`.
hidden_size (:obj:`int`, optional, defaults to 768):
hidden_size (:obj:`int`, `optional`, defaults to 768):
Dimensionality of the encoder layers and the pooler layer.
num_hidden_layers (:obj:`int`, optional, defaults to 12):
num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (:obj:`int`, optional, defaults to 12):
num_attention_heads (:obj:`int`, `optional`, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
intermediate_size (:obj:`int`, optional, defaults to 3072):
intermediate_size (:obj:`int`, `optional`, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
hidden_act (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
The non-linear activation function (function or string) in the encoder and pooler.
If string, "gelu", "relu", "swish" and "gelu_new" are supported.
hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
If string, :obj:`"gelu"`, :obj:`"relu"`, :obj:`"swish"` and :obj:`"gelu_new"` are supported.
hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
The dropout ratio for the attention probabilities.
max_position_embeddings (:obj:`int`, optional, defaults to 512):
max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
type_vocab_size (:obj:`int`, optional, defaults to 2):
The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
initializer_range (:obj:`float`, optional, defaults to 0.02):
type_vocab_size (:obj:`int`, `optional`, defaults to 2):
The vocabulary size of the :obj:`token_type_ids` passed into :class:`~transformers.LayoutLMModel`.
initializer_range (:obj:`float`, `optional`, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
The epsilon used by the layer normalization layers.
gradient_checkpointing (:obj:`bool`, optional, defaults to :obj:`False`):
gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
If True, use gradient checkpointing to save memory at the expense of a slower backward pass.
max_2d_position_embeddings (:obj:`int`, optional, defaults to 1024):
max_2d_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
The maximum value that the 2D position embedding might ever be used with.
Typically set this to something large just in case (e.g., 1024).
Example::
Examples::
>>> from transformers import LayoutLMModel, LayoutLMConfig
......
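The Examples:: block in the hunk above is cut off right after the import line. A minimal sketch, in the same doctest style, of how a LayoutLM config/model pair is typically instantiated with this API (the keyword argument value is illustrative, not taken from the diff):

>>> from transformers import LayoutLMConfig, LayoutLMModel
>>> # Build a configuration; every argument is optional and falls back to the
>>> # defaults documented above (vocab_size=30522, hidden_size=768, ...).
>>> configuration = LayoutLMConfig(max_2d_position_embeddings=1024)
>>> # Initialize a model (with random weights) from that configuration.
>>> model = LayoutLMModel(configuration)
>>> # The configuration is stored on the model and can be read back.
>>> configuration = model.config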
@@ -118,6 +118,7 @@ class LayoutLMEmbeddings(nn.Module):
return embeddings
# Copied from transformers.modeling_bert.BertSelfAttention with Bert->LayoutLM
class LayoutLMSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
@@ -172,6 +173,7 @@ class LayoutLMSelfAttention(nn.Module):
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
if attention_mask is not None:
# Apply the attention mask (precomputed for all layers in LayoutLMModel forward() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
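The hunk above shows the BERT-style scaled dot-product attention that LayoutLM reuses: raw scores from query and key, scaling by the square root of the head size, then an additive mask applied before the softmax. A self-contained sketch of that computation, assuming standard (batch, num_heads, seq_len, head_size) tensors; the function name is illustrative and not part of the diff:

import math

import torch


def scaled_dot_product_attention(query_layer, key_layer, value_layer, attention_mask=None):
    # Raw attention scores, shape (batch, num_heads, seq_len, seq_len).
    attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
    # Scale by sqrt(head size) so the softmax stays in a well-behaved range.
    attention_scores = attention_scores / math.sqrt(query_layer.size(-1))
    if attention_mask is not None:
        # The mask is additive: padded positions hold large negative values,
        # so their softmax weight is driven toward zero.
        attention_scores = attention_scores + attention_mask
    attention_probs = torch.softmax(attention_scores, dim=-1)
    return torch.matmul(attention_probs, value_layer)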
@@ -195,6 +197,7 @@ class LayoutLMSelfAttention(nn.Module):
return outputs
# Copied from transformers.modeling_bert.BertSelfOutput with Bert->LayoutLM
class LayoutLMSelfOutput(nn.Module):
def __init__(self, config):
super().__init__()
@@ -209,6 +212,7 @@ class LayoutLMSelfOutput(nn.Module):
return hidden_states
# Copied from transformers.modeling_bert.BertAttention with Bert->LayoutLM
class LayoutLMAttention(nn.Module):
def __init__(self, config):
super().__init__()
@@ -256,6 +260,7 @@ class LayoutLMAttention(nn.Module):
return outputs
# Copied from transformers.modeling_bert.BertIntermediate
class LayoutLMIntermediate(nn.Module):
def __init__(self, config):
super().__init__()
@@ -271,6 +276,7 @@ class LayoutLMIntermediate(nn.Module):
return hidden_states
# Copied from transformers.modeling_bert.BertOutput with Bert->LayoutLM
class LayoutLMOutput(nn.Module):
def __init__(self, config):
super().__init__()
@@ -285,6 +291,7 @@ class LayoutLMOutput(nn.Module):
return hidden_states
# Copied from transformers.modeling_bert.BertLayer with Bert->LayoutLM
class LayoutLMLayer(nn.Module):
def __init__(self, config):
super().__init__()
@@ -344,6 +351,7 @@ class LayoutLMLayer(nn.Module):
return layer_output
# Copied from transformers.modeling_bert.BertEncoder with Bert->LayoutLM
class LayoutLMEncoder(nn.Module):
def __init__(self, config):
super().__init__()
@@ -408,6 +416,7 @@ class LayoutLMEncoder(nn.Module):
)
# Copied from transformers.modeling_bert.BertPooler
class LayoutLMPooler(nn.Module):
def __init__(self, config):
super().__init__()
@@ -423,6 +432,7 @@ class LayoutLMPooler(nn.Module):
return pooled_output
# Copied from transformers.modeling_bert.BertPredictionHeadTransform with Bert->LayoutLM
class LayoutLMPredictionHeadTransform(nn.Module):
def __init__(self, config):
super().__init__()
@@ -440,6 +450,7 @@ class LayoutLMPredictionHeadTransform(nn.Module):
return hidden_states
# Copied from transformers.modeling_bert.BertLMPredictionHead with Bert->LayoutLM
class LayoutLMLMPredictionHead(nn.Module):
def __init__(self, config):
super().__init__()
@@ -460,6 +471,7 @@ class LayoutLMLMPredictionHead(nn.Module):
return hidden_states
# Copied from transformers.modeling_bert.BertOnlyMLMHead with Bert->LayoutLM
class LayoutLMOnlyMLMHead(nn.Module):
def __init__(self, config):
super().__init__()
@@ -470,28 +482,6 @@ class LayoutLMOnlyMLMHead(nn.Module):
return prediction_scores
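The three prediction-head hunks above are truncated. A minimal sketch of the BERT-style masked-LM head they mirror; the class name and default values are placeholders, only the overall structure (dense, activation, LayerNorm, then a projection to vocabulary logits) follows the pattern shown:

import torch.nn as nn


class MLMHeadSketch(nn.Module):
    """BERT-style masked-LM head: transform the hidden states, then project to vocabulary logits."""

    def __init__(self, hidden_size=768, vocab_size=30522, layer_norm_eps=1e-12):
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.activation = nn.GELU()
        self.layer_norm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
        # One score per vocabulary entry for every token position.
        self.decoder = nn.Linear(hidden_size, vocab_size)

    def forward(self, sequence_output):
        hidden_states = self.layer_norm(self.activation(self.dense(sequence_output)))
        # (batch, seq_len, vocab_size) logits over the vocabulary.
        return self.decoder(hidden_states)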
class LayoutLMOnlyNSPHead(nn.Module):
def __init__(self, config):
super().__init__()
self.seq_relationship = nn.Linear(config.hidden_size, 2)
def forward(self, pooled_output):
seq_relationship_score = self.seq_relationship(pooled_output)
return seq_relationship_score
class LayoutLMPreTrainingHeads(nn.Module):
def __init__(self, config):
super().__init__()
self.predictions = LayoutLMLMPredictionHead(config)
self.seq_relationship = nn.Linear(config.hidden_size, 2)
def forward(self, sequence_output, pooled_output):
prediction_scores = self.predictions(sequence_output)
seq_relationship_score = self.seq_relationship(pooled_output)
return prediction_scores, seq_relationship_score
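The two classes removed above come from BERT's next-sentence-prediction pretraining objective; their removal suggests they were unused by the LayoutLM models in this file. For reference, the deleted seq_relationship layer is simply a 2-way classifier over the pooled output; a small usage sketch with illustrative shapes:

import torch
import torch.nn as nn

pooled_output = torch.randn(4, 768)        # (batch, hidden_size) pooled [CLS] states
seq_relationship = nn.Linear(768, 2)       # same shape as the removed head's layer
seq_relationship_score = seq_relationship(pooled_output)  # (batch, 2) is-next / not-next logits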
class LayoutLMPreTrainedModel(PreTrainedModel):
"""An abstract class to handle weights initialization and
a simple interface for downloading and loading pretrained models.
......
@@ -142,7 +142,7 @@ class RobertaEmbeddings(nn.Module):
return position_ids.unsqueeze(0).expand(input_shape)
# Copied from transformers.modeling_bert.BertSelfAttention
# Copied from transformers.modeling_bert.BertSelfAttention with Bert->Roberta
class RobertaSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
@@ -197,7 +197,7 @@ class RobertaSelfAttention(nn.Module):
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
if attention_mask is not None:
# Apply the attention mask (precomputed for all layers in BertModel forward() function)
# Apply the attention mask (precomputed for all layers in RobertaModel forward() function)
attention_scores = attention_scores + attention_mask
# Normalize the attention scores to probabilities.
......