Unverified commit dc17f2a1 authored by Thomas Wolf, committed by GitHub

Merge pull request #2538 from huggingface/py3_super

💄 super
parents 88085484 a98b2ca8
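The change applied throughout the diff is mechanical: every Python 2-compatible two-argument call `super(ClassName, self).__init__(...)` is replaced by the Python 3 zero-argument form `super().__init__(...)`, which resolves the class and instance implicitly and behaves identically at runtime. A minimal sketch of the before/after pattern (the `MyModule` class and `hidden_size` argument are illustrative, not taken from the diff):

```python
import torch.nn as nn


class MyModule(nn.Module):
    def __init__(self, hidden_size):
        # Python 2 compatible form, removed by this PR:
        #   super(MyModule, self).__init__()
        # Python 3 zero-argument form, equivalent at runtime:
        super().__init__()
        self.dense = nn.Linear(hidden_size, hidden_size)
```

Arguments still pass through unchanged, e.g. the configuration classes forward `**kwargs` as `super().__init__(**kwargs)`.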
@@ -31,7 +31,7 @@ POOLING_BREAKDOWN = {1: (1, 1), 2: (2, 1), 3: (3, 1), 4: (2, 2), 5: (5, 1), 6: (
 class ImageEncoder(nn.Module):
     def __init__(self, args):
-        super(ImageEncoder, self).__init__()
+        super().__init__()
         model = torchvision.models.resnet152(pretrained=True)
         modules = list(model.children())[:-2]
         self.model = nn.Sequential(*modules)
@@ -5,7 +5,7 @@ class ClassificationHead(torch.nn.Module):
     """Classification Head for transformer encoders"""
     def __init__(self, class_size, embed_size):
-        super(ClassificationHead, self).__init__()
+        super().__init__()
         self.class_size = class_size
         self.embed_size = embed_size
         # self.mlp1 = torch.nn.Linear(embed_size, embed_size)
@@ -46,7 +46,7 @@ class Discriminator(torch.nn.Module):
     """Transformer encoder followed by a Classification Head"""
     def __init__(self, class_size, pretrained_model="gpt2-medium", cached_mode=False, device="cpu"):
-        super(Discriminator, self).__init__()
+        super().__init__()
         self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model)
         self.encoder = GPT2LMHeadModel.from_pretrained(pretrained_model)
         self.embed_size = self.encoder.transformer.config.hidden_size
@@ -80,7 +80,7 @@ class BertAbsConfig(PretrainedConfig):
         dec_dropout=0.2,
         **kwargs,
     ):
-        super(BertAbsConfig, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.max_pos = max_pos
@@ -47,7 +47,7 @@ class BertAbsPreTrainedModel(PreTrainedModel):
 class BertAbs(BertAbsPreTrainedModel):
     def __init__(self, args, checkpoint=None, bert_extractive_checkpoint=None):
-        super(BertAbs, self).__init__(args)
+        super().__init__(args)
         self.args = args
         self.bert = Bert()
@@ -122,7 +122,7 @@ class Bert(nn.Module):
     """
     def __init__(self):
-        super(Bert, self).__init__()
+        super().__init__()
         config = BertConfig.from_pretrained("bert-base-uncased")
         self.model = BertModel(config)
@@ -151,7 +151,7 @@ class TransformerDecoder(nn.Module):
     """
     def __init__(self, num_layers, d_model, heads, d_ff, dropout, embeddings, vocab_size):
-        super(TransformerDecoder, self).__init__()
+        super().__init__()
         # Basic attributes.
         self.decoder_type = "transformer"
@@ -261,7 +261,7 @@ class PositionalEncoding(nn.Module):
         pe[:, 0::2] = torch.sin(position.float() * div_term)
         pe[:, 1::2] = torch.cos(position.float() * div_term)
         pe = pe.unsqueeze(0)
-        super(PositionalEncoding, self).__init__()
+        super().__init__()
         self.register_buffer("pe", pe)
         self.dropout = nn.Dropout(p=dropout)
         self.dim = dim
@@ -293,7 +293,7 @@ class TransformerDecoderLayer(nn.Module):
     """
     def __init__(self, d_model, heads, d_ff, dropout):
-        super(TransformerDecoderLayer, self).__init__()
+        super().__init__()
         self.self_attn = MultiHeadedAttention(heads, d_model, dropout=dropout)
@@ -410,7 +410,7 @@ class MultiHeadedAttention(nn.Module):
         self.dim_per_head = model_dim // head_count
         self.model_dim = model_dim
-        super(MultiHeadedAttention, self).__init__()
+        super().__init__()
         self.head_count = head_count
         self.linear_keys = nn.Linear(model_dim, head_count * self.dim_per_head)
@@ -639,7 +639,7 @@ class PositionwiseFeedForward(nn.Module):
     """
    def __init__(self, d_model, d_ff, dropout=0.1):
-        super(PositionwiseFeedForward, self).__init__()
+        super().__init__()
         self.w_1 = nn.Linear(d_model, d_ff)
         self.w_2 = nn.Linear(d_ff, d_model)
         self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
@@ -122,7 +122,7 @@ class AlbertConfig(PretrainedConfig):
         layer_norm_eps=1e-12,
         **kwargs
     ):
-        super(AlbertConfig, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.embedding_size = embedding_size
@@ -125,7 +125,7 @@ class BertConfig(PretrainedConfig):
         layer_norm_eps=1e-12,
         **kwargs
     ):
-        super(BertConfig, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.hidden_size = hidden_size
@@ -106,7 +106,7 @@ class CTRLConfig(PretrainedConfig):
         summary_first_dropout=0.1,
         **kwargs
     ):
-        super(CTRLConfig, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.n_ctx = n_ctx
         self.n_positions = n_positions
@@ -113,7 +113,7 @@ class DistilBertConfig(PretrainedConfig):
         seq_classif_dropout=0.2,
         **kwargs
     ):
-        super(DistilBertConfig, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.max_position_embeddings = max_position_embeddings
         self.sinusoidal_pos_embds = sinusoidal_pos_embds
@@ -136,7 +136,7 @@ class GPT2Config(PretrainedConfig):
         summary_first_dropout=0.1,
         **kwargs
     ):
-        super(GPT2Config, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.n_ctx = n_ctx
@@ -138,7 +138,7 @@ class OpenAIGPTConfig(PretrainedConfig):
         summary_first_dropout=0.1,
         **kwargs
     ):
-        super(OpenAIGPTConfig, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.n_ctx = n_ctx
@@ -77,7 +77,7 @@ class T5Config(PretrainedConfig):
         initializer_factor=1.0,
         **kwargs
     ):
-        super(T5Config, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.n_positions = n_positions
         self.d_model = d_model
@@ -151,7 +151,7 @@ class TransfoXLConfig(PretrainedConfig):
         layer_norm_epsilon=1e-5,
         **kwargs
     ):
-        super(TransfoXLConfig, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.cutoffs = []
@@ -197,7 +197,7 @@ class XLMConfig(PretrainedConfig):
     ):
         """Constructs XLMConfig.
         """
-        super(XLMConfig, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.emb_dim = emb_dim
         self.n_layers = n_layers
@@ -159,7 +159,7 @@ class XLNetConfig(PretrainedConfig):
     ):
         """Constructs XLNetConfig.
         """
-        super(XLNetConfig, self).__init__(**kwargs)
+        super().__init__(**kwargs)
         self.vocab_size = vocab_size
         self.d_model = d_model
         self.n_layer = n_layer
@@ -167,7 +167,7 @@ class AlbertEmbeddings(BertEmbeddings):
     """
     def __init__(self, config):
-        super(AlbertEmbeddings, self).__init__(config)
+        super().__init__(config)
         self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=0)
         self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size)
@@ -177,7 +177,7 @@ class AlbertEmbeddings(BertEmbeddings):
 class AlbertAttention(BertSelfAttention):
     def __init__(self, config):
-        super(AlbertAttention, self).__init__(config)
+        super().__init__(config)
         self.output_attentions = config.output_attentions
         self.num_attention_heads = config.num_attention_heads
@@ -258,7 +258,7 @@ class AlbertAttention(BertSelfAttention):
 class AlbertLayer(nn.Module):
     def __init__(self, config):
-        super(AlbertLayer, self).__init__()
+        super().__init__()
         self.config = config
         self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
@@ -279,7 +279,7 @@ class AlbertLayer(nn.Module):
 class AlbertLayerGroup(nn.Module):
     def __init__(self, config):
-        super(AlbertLayerGroup, self).__init__()
+        super().__init__()
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
@@ -309,7 +309,7 @@ class AlbertLayerGroup(nn.Module):
 class AlbertTransformer(nn.Module):
     def __init__(self, config):
-        super(AlbertTransformer, self).__init__()
+        super().__init__()
         self.config = config
         self.output_attentions = config.output_attentions
@@ -471,7 +471,7 @@ class AlbertModel(AlbertPreTrainedModel):
     base_model_prefix = "albert"
     def __init__(self, config):
-        super(AlbertModel, self).__init__(config)
+        super().__init__(config)
         self.config = config
         self.embeddings = AlbertEmbeddings(config)
@@ -571,7 +571,7 @@ class AlbertModel(AlbertPreTrainedModel):
 class AlbertMLMHead(nn.Module):
     def __init__(self, config):
-        super(AlbertMLMHead, self).__init__()
+        super().__init__()
         self.LayerNorm = nn.LayerNorm(config.embedding_size)
         self.bias = nn.Parameter(torch.zeros(config.vocab_size))
@@ -619,7 +619,7 @@ class AlbertForMaskedLM(AlbertPreTrainedModel):
     """
     def __init__(self, config):
-        super(AlbertForMaskedLM, self).__init__(config)
+        super().__init__(config)
         self.albert = AlbertModel(config)
         self.predictions = AlbertMLMHead(config)
@@ -706,7 +706,7 @@ class AlbertForSequenceClassification(AlbertPreTrainedModel):
     """
     def __init__(self, config):
-        super(AlbertForSequenceClassification, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels
         self.albert = AlbertModel(config)
@@ -804,7 +804,7 @@ class AlbertForQuestionAnswering(AlbertPreTrainedModel):
     """
     def __init__(self, config):
-        super(AlbertForQuestionAnswering, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels
         self.albert = AlbertModel(config)
@@ -160,7 +160,7 @@ class BertEmbeddings(nn.Module):
     """
     def __init__(self, config):
-        super(BertEmbeddings, self).__init__()
+        super().__init__()
         self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=0)
         self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
         self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
@@ -197,7 +197,7 @@ class BertEmbeddings(nn.Module):
 class BertSelfAttention(nn.Module):
     def __init__(self, config):
-        super(BertSelfAttention, self).__init__()
+        super().__init__()
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
@@ -275,7 +275,7 @@ class BertSelfAttention(nn.Module):
 class BertSelfOutput(nn.Module):
     def __init__(self, config):
-        super(BertSelfOutput, self).__init__()
+        super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
@@ -289,7 +289,7 @@ class BertSelfOutput(nn.Module):
 class BertAttention(nn.Module):
     def __init__(self, config):
-        super(BertAttention, self).__init__()
+        super().__init__()
         self.self = BertSelfAttention(config)
         self.output = BertSelfOutput(config)
         self.pruned_heads = set()
@@ -335,7 +335,7 @@ class BertAttention(nn.Module):
 class BertIntermediate(nn.Module):
     def __init__(self, config):
-        super(BertIntermediate, self).__init__()
+        super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
         if isinstance(config.hidden_act, str):
             self.intermediate_act_fn = ACT2FN[config.hidden_act]
@@ -350,7 +350,7 @@ class BertIntermediate(nn.Module):
 class BertOutput(nn.Module):
     def __init__(self, config):
-        super(BertOutput, self).__init__()
+        super().__init__()
         self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
         self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
@@ -364,7 +364,7 @@ class BertOutput(nn.Module):
 class BertLayer(nn.Module):
     def __init__(self, config):
-        super(BertLayer, self).__init__()
+        super().__init__()
         self.attention = BertAttention(config)
         self.is_decoder = config.is_decoder
         if self.is_decoder:
@@ -399,7 +399,7 @@ class BertLayer(nn.Module):
 class BertEncoder(nn.Module):
     def __init__(self, config):
-        super(BertEncoder, self).__init__()
+        super().__init__()
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
         self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
@@ -440,7 +440,7 @@ class BertEncoder(nn.Module):
 class BertPooler(nn.Module):
     def __init__(self, config):
-        super(BertPooler, self).__init__()
+        super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.activation = nn.Tanh()
@@ -455,7 +455,7 @@ class BertPooler(nn.Module):
 class BertPredictionHeadTransform(nn.Module):
     def __init__(self, config):
-        super(BertPredictionHeadTransform, self).__init__()
+        super().__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         if isinstance(config.hidden_act, str):
             self.transform_act_fn = ACT2FN[config.hidden_act]
@@ -472,7 +472,7 @@ class BertPredictionHeadTransform(nn.Module):
 class BertLMPredictionHead(nn.Module):
     def __init__(self, config):
-        super(BertLMPredictionHead, self).__init__()
+        super().__init__()
         self.transform = BertPredictionHeadTransform(config)
         # The output weights are the same as the input embeddings, but there is
@@ -492,7 +492,7 @@ class BertLMPredictionHead(nn.Module):
 class BertOnlyMLMHead(nn.Module):
     def __init__(self, config):
-        super(BertOnlyMLMHead, self).__init__()
+        super().__init__()
         self.predictions = BertLMPredictionHead(config)
     def forward(self, sequence_output):
@@ -502,7 +502,7 @@ class BertOnlyMLMHead(nn.Module):
 class BertOnlyNSPHead(nn.Module):
     def __init__(self, config):
-        super(BertOnlyNSPHead, self).__init__()
+        super().__init__()
         self.seq_relationship = nn.Linear(config.hidden_size, 2)
     def forward(self, pooled_output):
@@ -512,7 +512,7 @@ class BertOnlyNSPHead(nn.Module):
 class BertPreTrainingHeads(nn.Module):
     def __init__(self, config):
-        super(BertPreTrainingHeads, self).__init__()
+        super().__init__()
         self.predictions = BertLMPredictionHead(config)
         self.seq_relationship = nn.Linear(config.hidden_size, 2)
@@ -657,7 +657,7 @@ class BertModel(BertPreTrainedModel):
     """
     def __init__(self, config):
-        super(BertModel, self).__init__(config)
+        super().__init__(config)
         self.config = config
         self.embeddings = BertEmbeddings(config)
@@ -864,7 +864,7 @@ class BertForPreTraining(BertPreTrainedModel):
     """
    def __init__(self, config):
-        super(BertForPreTraining, self).__init__(config)
+        super().__init__(config)
         self.bert = BertModel(config)
         self.cls = BertPreTrainingHeads(config)
@@ -954,7 +954,7 @@ class BertForMaskedLM(BertPreTrainedModel):
     """
     def __init__(self, config):
-        super(BertForMaskedLM, self).__init__(config)
+        super().__init__(config)
         self.bert = BertModel(config)
         self.cls = BertOnlyMLMHead(config)
@@ -1053,7 +1053,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
     """
     def __init__(self, config):
-        super(BertForNextSentencePrediction, self).__init__(config)
+        super().__init__(config)
         self.bert = BertModel(config)
         self.cls = BertOnlyNSPHead(config)
@@ -1132,7 +1132,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
     """
     def __init__(self, config):
-        super(BertForSequenceClassification, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels
         self.bert = BertModel(config)
@@ -1221,7 +1221,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
     """
     def __init__(self, config):
-        super(BertForMultipleChoice, self).__init__(config)
+        super().__init__(config)
         self.bert = BertModel(config)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
@@ -1308,7 +1308,7 @@ class BertForTokenClassification(BertPreTrainedModel):
     """
    def __init__(self, config):
-        super(BertForTokenClassification, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels
         self.bert = BertModel(config)
@@ -1406,7 +1406,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
     """
     def __init__(self, config):
-        super(BertForQuestionAnswering, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels
         self.bert = BertModel(config)
@@ -81,7 +81,7 @@ def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=N
 class MultiHeadAttention(torch.nn.Module):
     def __init__(self, d_model_size, num_heads, output_attentions=False):
-        super(MultiHeadAttention, self).__init__()
+        super().__init__()
         self.output_attentions = output_attentions
         self.num_heads = num_heads
         self.d_model_size = d_model_size
@@ -132,7 +132,7 @@ def point_wise_feed_forward_network(d_model_size, dff):
 class EncoderLayer(torch.nn.Module):
     def __init__(self, d_model_size, num_heads, dff, rate=0.1, output_attentions=False):
-        super(EncoderLayer, self).__init__()
+        super().__init__()
         self.multi_head_attention = MultiHeadAttention(d_model_size, num_heads, output_attentions)
         self.ffn = point_wise_feed_forward_network(d_model_size, dff)
@@ -274,7 +274,7 @@ class CTRLModel(CTRLPreTrainedModel):
     """
     def __init__(self, config):
-        super(CTRLModel, self).__init__(config)
+        super().__init__(config)
         self.output_hidden_states = config.output_hidden_states
         self.output_attentions = config.output_attentions
         self.output_past = config.output_past
@@ -481,7 +481,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel):
     """
    def __init__(self, config):
-        super(CTRLLMHeadModel, self).__init__(config)
+        super().__init__(config)
         self.transformer = CTRLModel(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True)
@@ -59,7 +59,7 @@ def create_sinusoidal_embeddings(n_pos, dim, out):
 class Embeddings(nn.Module):
     def __init__(self, config):
-        super(Embeddings, self).__init__()
+        super().__init__()
         self.word_embeddings = nn.Embedding(config.vocab_size, config.dim, padding_idx=0)
         self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.dim)
         if config.sinusoidal_pos_embds:
@@ -97,7 +97,7 @@ class Embeddings(nn.Module):
 class MultiHeadSelfAttention(nn.Module):
     def __init__(self, config):
-        super(MultiHeadSelfAttention, self).__init__()
+        super().__init__()
         self.n_heads = config.n_heads
         self.dim = config.dim
@@ -195,7 +195,7 @@ class MultiHeadSelfAttention(nn.Module):
 class FFN(nn.Module):
     def __init__(self, config):
-        super(FFN, self).__init__()
+        super().__init__()
         self.dropout = nn.Dropout(p=config.dropout)
         self.lin1 = nn.Linear(in_features=config.dim, out_features=config.hidden_dim)
         self.lin2 = nn.Linear(in_features=config.hidden_dim, out_features=config.dim)
@@ -214,7 +214,7 @@ class FFN(nn.Module):
 class TransformerBlock(nn.Module):
     def __init__(self, config):
-        super(TransformerBlock, self).__init__()
+        super().__init__()
         self.n_heads = config.n_heads
         self.dim = config.dim
@@ -266,7 +266,7 @@ class TransformerBlock(nn.Module):
 class Transformer(nn.Module):
     def __init__(self, config):
-        super(Transformer, self).__init__()
+        super().__init__()
         self.n_layers = config.n_layers
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
@@ -424,7 +424,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
     """
     def __init__(self, config):
-        super(DistilBertModel, self).__init__(config)
+        super().__init__(config)
         self.embeddings = Embeddings(config)  # Embeddings
         self.transformer = Transformer(config)  # Encoder
@@ -525,7 +525,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
     """
     def __init__(self, config):
-        super(DistilBertForMaskedLM, self).__init__(config)
+        super().__init__(config)
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
@@ -600,7 +600,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
     """
    def __init__(self, config):
-        super(DistilBertForSequenceClassification, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels
         self.distilbert = DistilBertModel(config)
@@ -679,7 +679,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
     """
     def __init__(self, config):
-        super(DistilBertForQuestionAnswering, self).__init__(config)
+        super().__init__(config)
         self.distilbert = DistilBertModel(config)
         self.qa_outputs = nn.Linear(config.dim, config.num_labels)
@@ -766,7 +766,7 @@ class DistilBertForTokenClassification(DistilBertPreTrainedModel):
     """
     def __init__(self, config):
-        super(DistilBertForTokenClassification, self).__init__(config)
+        super().__init__(config)
         self.num_labels = config.num_labels
         self.distilbert = DistilBertModel(config)
@@ -37,7 +37,7 @@ class PreTrainedEncoderDecoder(nn.Module):
     """
     def __init__(self, encoder, decoder):
-        super(PreTrainedEncoderDecoder, self).__init__()
+        super().__init__()
         self.encoder = encoder
         self.decoder = decoder
@@ -290,7 +290,7 @@ class Model2Model(PreTrainedEncoderDecoder):
     """
     def __init__(self, *args, **kwargs):
-        super(Model2Model, self).__init__(*args, **kwargs)
+        super().__init__(*args, **kwargs)
         self.tie_weights()
     def tie_weights(self):
@@ -321,7 +321,7 @@ class Model2Model(PreTrainedEncoderDecoder):
         ):
             raise ValueError("Only the Bert model is currently supported.")
-        model = super(Model2Model, cls).from_pretrained(
+        model = super().from_pretrained(
             encoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
             decoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
             *args,
@@ -345,5 +345,5 @@ class Model2LSTM(PreTrainedEncoderDecoder):
                     " E.g. `decoder_config={'input_size': 768, 'hidden_size': 768, 'num_layers': 2}`"
                 )
             kwargs["decoder_model"] = torch.nn.LSTM(kwargs.pop("decoder_config"))
-        model = super(Model2LSTM, cls).from_pretrained(*args, **kwargs)
+        model = super().from_pretrained(*args, **kwargs)
         return model
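Note that the zero-argument form also covers the classmethod overrides in the last two hunks: `super()` resolves through the implicit `__class__` cell and the first positional argument, so `cls` no longer has to be spelled out. A minimal sketch of that pattern (the `Base`/`Derived` names are illustrative, not from the diff):

```python
class Base:
    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        return cls(*args, **kwargs)


class Derived(Base):
    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        # Equivalent to: super(Derived, cls).from_pretrained(*args, **kwargs)
        return super().from_pretrained(*args, **kwargs)
```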