Commit 982f181a authored by erenup

Merge remote-tracking branch 'origin/master' into run_multiple_choice_add_doc

parents 603b470a 84b9d1c4
@@ -21,7 +21,7 @@ from io import open
 import torch

-from pytorch_transformers.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME,
+from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
                                   OpenAIGPTConfig,
                                   OpenAIGPTModel,
                                   load_tf_weights_in_openai_gpt)
...
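
Note: the same change recurs across the conversion scripts below — imports move from private submodules (pytorch_transformers.modeling_*) to the package's public top-level namespace. A minimal sketch of what a caller relies on after this change; the names are assumed to be re-exported at the package root, as this commit's own imports suggest:

    from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
                                      OpenAIGPTConfig, OpenAIGPTModel,
                                      load_tf_weights_in_openai_gpt)

    config = OpenAIGPTConfig()        # default GPT configuration from the library
    model = OpenAIGPTModel(config)    # freshly initialised model that TF weights could be loaded into
    print(WEIGHTS_NAME, CONFIG_NAME)  # canonical file names: pytorch_model.bin / config.json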
@@ -20,7 +20,7 @@ import argparse
 import torch
 import numpy as np
 import tensorflow as tf
-from pytorch_transformers.modeling import BertModel
+from pytorch_transformers import BertModel

 def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
...
@@ -23,12 +23,12 @@ import torch
 from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
 from fairseq.modules import TransformerSentenceEncoderLayer
-from pytorch_transformers.modeling_bert import (BertConfig, BertEncoder,
+from pytorch_transformers import (BertConfig, BertEncoder,
                                   BertIntermediate, BertLayer,
                                   BertModel, BertOutput,
                                   BertSelfAttention,
                                   BertSelfOutput)
-from pytorch_transformers.modeling_roberta import (RobertaEmbeddings,
+from pytorch_transformers import (RobertaEmbeddings,
                                   RobertaForMaskedLM,
                                   RobertaForSequenceClassification,
                                   RobertaModel)
@@ -53,6 +53,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
         intermediate_size=roberta.args.encoder_ffn_embed_dim,
         max_position_embeddings=514,
         type_vocab_size=1,
+        layer_norm_eps=1e-5, # PyTorch default used in fairseq
     )
     if classification_head:
         config.num_labels = roberta.args.num_classes
@@ -69,7 +70,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
     model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight)  # just zero them out b/c RoBERTa doesn't use them.
     model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
     model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias
-    model.roberta.embeddings.LayerNorm.variance_epsilon = roberta_sent_encoder.emb_layer_norm.eps

     for i in range(config.num_hidden_layers):
         # Encoder: start of layer
@@ -98,7 +98,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
         self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
         self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
         self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias
-        self_output.LayerNorm.variance_epsilon = roberta_layer.self_attn_layer_norm.eps

         ### intermediate
         intermediate: BertIntermediate = layer.intermediate
@@ -117,7 +116,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
         bert_output.dense.bias = roberta_layer.fc2.bias
         bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
         bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
-        bert_output.LayerNorm.variance_epsilon = roberta_layer.final_layer_norm.eps
         #### end of layer

     if classification_head:
@@ -131,7 +129,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
     model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
     model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
     model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
-    model.lm_head.layer_norm.variance_epsilon = roberta.model.decoder.lm_head.layer_norm.eps
     model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
     model.lm_head.bias = roberta.model.decoder.lm_head.bias
@@ -144,6 +141,8 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
     else:
         their_output = roberta.model(input_ids)[0]
     print(our_output.shape, their_output.shape)
+    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
+    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
     success = torch.allclose(our_output, their_output, atol=1e-3)
     print(
         "Do both models output the same tensors?",
...
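
Note on the parity check added at the end of the conversion: printing the maximum absolute difference before the allclose test makes failures much easier to diagnose. A self-contained sketch of the same pattern, with random tensors standing in for the two models' outputs (shapes are illustrative only):

    import torch

    def outputs_match(our_output, their_output, atol=1e-3):
        """Report the worst-case elementwise difference, then apply the tolerance test."""
        max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
        print(f"max_absolute_diff = {max_absolute_diff}")   # ~1e-7 for a faithful weight port
        return torch.allclose(our_output, their_output, atol=atol)

    reference = torch.randn(1, 11, 768)
    assert outputs_match(reference, reference + 1e-7)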
@@ -21,7 +21,7 @@ from __future__ import print_function
 import argparse
 import torch

-from pytorch_transformers.modeling_bert import BertConfig, BertForPreTraining, load_tf_weights_in_bert
+from pytorch_transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert

 import logging
 logging.basicConfig(level=logging.INFO)
...
@@ -26,7 +26,7 @@ import torch
 import pytorch_transformers.tokenization_transfo_xl as data_utils

 from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
-from pytorch_transformers.modeling_transfo_xl import (TransfoXLConfig, TransfoXLLMHeadModel,
+from pytorch_transformers import (TransfoXLConfig, TransfoXLLMHeadModel,
                                   load_tf_weights_in_transfo_xl)
 from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
...
@@ -23,7 +23,7 @@ from io import open
 import torch
 import numpy

-from pytorch_transformers.modeling_utils import CONFIG_NAME, WEIGHTS_NAME
+from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
 from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES

 import logging
...
@@ -22,7 +22,7 @@ import os
 import argparse
 import torch

-from pytorch_transformers.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME,
+from pytorch_transformers import (CONFIG_NAME, WEIGHTS_NAME,
                                   XLNetConfig,
                                   XLNetLMHeadModel, XLNetForQuestionAnswering,
                                   XLNetForSequenceClassification,
...
@@ -9,6 +9,7 @@ import sys
 import json
 import logging
 import os
+import six
 import shutil
 import tempfile
 import fnmatch
@@ -47,8 +48,35 @@ except (AttributeError, ImportError):
 PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE  # Kept for backward compatibility

+WEIGHTS_NAME = "pytorch_model.bin"
+TF_WEIGHTS_NAME = 'model.ckpt'
+CONFIG_NAME = "config.json"
+
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name

+if not six.PY2:
+    def add_start_docstrings(*docstr):
+        def docstring_decorator(fn):
+            fn.__doc__ = ''.join(docstr) + fn.__doc__
+            return fn
+        return docstring_decorator
+
+    def add_end_docstrings(*docstr):
+        def docstring_decorator(fn):
+            fn.__doc__ = fn.__doc__ + ''.join(docstr)
+            return fn
+        return docstring_decorator
+else:
+    # Not possible to update class docstrings on python2
+    def add_start_docstrings(*docstr):
+        def docstring_decorator(fn):
+            return fn
+        return docstring_decorator
+
+    def add_end_docstrings(*docstr):
+        def docstring_decorator(fn):
+            return fn
+        return docstring_decorator
+
 def url_to_filename(url, etag=None):
     """
...
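
For context, add_start_docstrings / add_end_docstrings simply concatenate shared documentation onto an object's existing docstring (and are no-ops on Python 2, where class docstrings cannot be reassigned). A small usage sketch; the class and docstring text below are invented for illustration:

    from pytorch_transformers.file_utils import add_start_docstrings

    SHARED_INPUTS_DOCSTRING = r"""
        Inputs:
            input_ids: token indices of shape (batch_size, sequence_length).
    """

    @add_start_docstrings("A toy class used only to demonstrate the decorator.", SHARED_INPUTS_DOCSTRING)
    class ExampleModel(object):
        """Model-specific notes go here."""

    print(ExampleModel.__doc__)   # shared header + shared inputs section + class-specific notes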
@@ -31,7 +31,9 @@ import numpy as np
 import torch
 import torch.nn as nn

-from pytorch_transformers.modeling_utils import PretrainedConfig, PreTrainedModel, add_start_docstrings, prune_linear_layer
+from .modeling_utils import PreTrainedModel, prune_linear_layer
+from .configuration_distilbert import DistilBertConfig
+from .file_utils import add_start_docstrings

 import logging
 logger = logging.getLogger(__name__)
@@ -42,69 +44,6 @@ DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-pytorch_model.bin"
 }

-DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    'distilbert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-config.json",
-    'distilbert-base-uncased-distilled-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-distilled-squad-config.json"
-}
-
-class DistilBertConfig(PretrainedConfig):
-    pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-
-    def __init__(self,
-                 vocab_size_or_config_json_file=30522,
-                 max_position_embeddings=512,
-                 sinusoidal_pos_embds=True,
-                 n_layers=6,
-                 n_heads=12,
-                 dim=768,
-                 hidden_dim=4*768,
-                 dropout=0.1,
-                 attention_dropout=0.1,
-                 activation='gelu',
-                 initializer_range=0.02,
-                 tie_weights_=True,
-                 qa_dropout=0.1,
-                 seq_classif_dropout=0.2,
-                 **kwargs):
-        super(DistilBertConfig, self).__init__(**kwargs)
-
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
-            self.max_position_embeddings = max_position_embeddings
-            self.sinusoidal_pos_embds = sinusoidal_pos_embds
-            self.n_layers = n_layers
-            self.n_heads = n_heads
-            self.dim = dim
-            self.hidden_dim = hidden_dim
-            self.dropout = dropout
-            self.attention_dropout = attention_dropout
-            self.activation = activation
-            self.initializer_range = initializer_range
-            self.tie_weights_ = tie_weights_
-            self.qa_dropout = qa_dropout
-            self.seq_classif_dropout = seq_classif_dropout
-        else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
-
-    @property
-    def hidden_size(self):
-        return self.dim
-
-    @property
-    def num_attention_heads(self):
-        return self.n_heads
-
-    @property
-    def num_hidden_layers(self):
-        return self.n_layers
-

 ### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
 def gelu(x):
@@ -174,12 +113,16 @@ class MultiHeadSelfAttention(nn.Module):
         self.v_lin = nn.Linear(in_features=config.dim, out_features=config.dim)
         self.out_lin = nn.Linear(in_features=config.dim, out_features=config.dim)

+        self.pruned_heads = set()
+
     def prune_heads(self, heads):
         attention_head_size = self.dim // self.n_heads
         if len(heads) == 0:
             return
         mask = torch.ones(self.n_heads, attention_head_size)
+        heads = set(heads) - self.pruned_heads
         for head in heads:
+            head -= sum(1 if h < head else 0 for h in self.pruned_heads)
             mask[head] = 0
         mask = mask.view(-1).contiguous().eq(1)
         index = torch.arange(len(mask))[mask].long()
@@ -191,6 +134,7 @@ class MultiHeadSelfAttention(nn.Module):
         # Update hyper params
         self.n_heads = self.n_heads - len(heads)
         self.dim = attention_head_size * self.n_heads
+        self.pruned_heads = self.pruned_heads.union(heads)

     def forward(self, query, key, value, mask, head_mask = None):
         """
@@ -395,7 +339,7 @@ class DistilBertPreTrainedModel(PreTrainedModel):
     def __init__(self, *inputs, **kwargs):
         super(DistilBertPreTrainedModel, self).__init__(*inputs, **kwargs)

-    def init_weights(self, module):
+    def _init_weights(self, module):
         """ Initialize the weights.
         """
         if isinstance(module, nn.Embedding):
@@ -480,7 +424,7 @@ class DistilBertModel(DistilBertPreTrainedModel):
         self.embeddings = Embeddings(config)   # Embeddings
         self.transformer = Transformer(config) # Encoder

-        self.apply(self.init_weights)
+        self.init_weights()

     def _resize_token_embeddings(self, new_num_tokens):
         old_embeddings = self.embeddings.word_embeddings
@@ -568,7 +512,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
         self.vocab_layer_norm = nn.LayerNorm(config.dim, eps=1e-12)
         self.vocab_projector = nn.Linear(config.dim, config.vocab_size)

-        self.apply(self.init_weights)
+        self.init_weights()
         self.tie_weights()

         self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
@@ -580,7 +524,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel):
         self._tie_or_clone_weights(self.vocab_projector,
                                    self.distilbert.embeddings.word_embeddings)

-    def forward(self, input_ids, attention_mask=None, masked_lm_labels=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, head_mask=None, masked_lm_labels=None):
         dlbrt_output = self.distilbert(input_ids=input_ids,
                                        attention_mask=attention_mask,
                                        head_mask=head_mask)
@@ -642,9 +586,9 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel):
         self.classifier = nn.Linear(config.dim, config.num_labels)
         self.dropout = nn.Dropout(config.seq_classif_dropout)

-        self.apply(self.init_weights)
+        self.init_weights()

-    def forward(self, input_ids, attention_mask=None, labels=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, head_mask=None, labels=None):
         distilbert_output = self.distilbert(input_ids=input_ids,
                                             attention_mask=attention_mask,
                                             head_mask=head_mask)
@@ -716,9 +660,9 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
         assert config.num_labels == 2
         self.dropout = nn.Dropout(config.qa_dropout)

-        self.apply(self.init_weights)
+        self.init_weights()

-    def forward(self, input_ids, attention_mask=None, start_positions=None, end_positions=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, head_mask=None, start_positions=None, end_positions=None):
         distilbert_output = self.distilbert(input_ids=input_ids,
                                             attention_mask=attention_mask,
                                             head_mask=head_mask)
...
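
With DistilBertConfig moved out of the modeling file into configuration_distilbert.py, user code constructs it exactly as before. A minimal sketch, assuming the configuration and model classes are re-exported at the package root in this release:

    from pytorch_transformers import DistilBertConfig, DistilBertModel   # assumed top-level re-exports

    config = DistilBertConfig(n_layers=6, n_heads=12, dim=768, hidden_dim=4 * 768)
    model = DistilBertModel(config)                      # weights now set up via the new init_weights()
    print(config.hidden_size, config.num_hidden_layers)  # 768 6, properties backed by dim / n_layers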