Commit 7d03c537 authored by thomwolf

conversion working

parent 3a9c8837
@@ -120,3 +120,6 @@ dmypy.json
 # TF code
 tensorflow_code
+# models
+models
\ No newline at end of file
@@ -3,9 +3,14 @@ def main():
    import sys
    if (len(sys.argv) != 4 and len(sys.argv) != 5) or sys.argv[1] not in [
        "convert_tf_checkpoint_to_pytorch",
-        "convert_openai_checkpoint"
+        "convert_openai_checkpoint",
+        "convert_transfo_xl_checkpoint"
    ]:
-        print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT` \n or `pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
+        print(
+            "Should be used as"
+            "`pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
+            "`pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]` or \n"
+            "`pytorch_pretrained_bert convert_transfo_xl_checkpoint TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
    else:
        if sys.argv[1] == "convert_tf_checkpoint_to_pytorch":
            try:
@@ -24,7 +29,7 @@ def main():
            TF_CONFIG = sys.argv.pop()
            TF_CHECKPOINT = sys.argv.pop()
            convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
-        else:
+        elif sys.argv[1] == "convert_openai_checkpoint":
            from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
            OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
            PYTORCH_DUMP_OUTPUT = sys.argv[3]
@@ -35,6 +40,22 @@ def main():
            convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH,
                                                 OPENAI_GPT_CONFIG,
                                                 PYTORCH_DUMP_OUTPUT)
+        else:
+            try:
+                from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
+            except ModuleNotFoundError:
+                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                      "In that case, it requires TensorFlow to be installed. Please see "
+                      "https://www.tensorflow.org/install/ for installation instructions.")
+                raise
+            TF_CHECKPOINT = sys.argv[2]
+            PYTORCH_DUMP_OUTPUT = sys.argv[3]
+            if len(sys.argv) == 5:
+                TF_CONFIG = sys.argv[4]
+            else:
+                TF_CONFIG = ""
+            convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)

if __name__ == '__main__':
    main()
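For reference, not part of the diff: the new sub-command mirrors the usage string printed above, with TF_CHECKPOINT, PYTORCH_DUMP_OUTPUT and the optional TF_CONFIG replaced by real paths, e.g.

    pytorch_pretrained_bert convert_transfo_xl_checkpoint /path/to/tf_checkpoint /path/to/pytorch_dump
    pytorch_pretrained_bert convert_transfo_xl_checkpoint /path/to/tf_checkpoint /path/to/pytorch_dump /path/to/transfo_xl_config.json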
@@ -18,11 +18,9 @@ from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

-import os
import re
import json
import argparse

-import tensorflow as tf
import torch
import numpy as np
...
@@ -25,7 +25,72 @@ import tensorflow as tf
import torch
import numpy as np

-from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel, CONFIG_NAME, WEIGHTS_NAME
+from pytorch_pretrained_bert.modeling_transfo_xl import TransfoXLConfig, TransfoXLModel, CONFIG_NAME, WEIGHTS_NAME
def build_tf_to_pytorch_map(model, config):
""" A map of modules from TF to PyTorch """
tf_to_pt_map = {}
# Embeddings cutoffs
for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)):
layer_str = "transformer/adaptive_embed/cutoff_%d/" % i
tf_to_pt_map.update({
layer_str + 'lookup_table': embed_l.weight,
layer_str + 'proj_W': proj_l
})
# Transformer blocks
for i, b in enumerate(model.layers):
layer_str = "transformer/layer_%d/" % i
tf_to_pt_map.update({
layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight,
layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias,
layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight,
layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight,
layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight,
layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight,
layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias,
layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight,
layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias,
layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight,
layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias,
})
# Softmax cutoffs
for i, (out_l, proj_l, tie_proj) in enumerate(zip(
model.crit.out_layers,
model.crit.out_projs,
config.tie_projs)):
layer_str = "transformer/adaptive_softmax/cutoff_%d/" % i
if config.tie_weight:
tf_to_pt_map.update({
layer_str + 'b': out_l.bias})
else:
raise NotImplementedError
# I don't think this is implemented in the TF code
tf_to_pt_map.update({
layer_str + 'lookup_table': out_l.weight,
layer_str + 'b': out_l.bias})
if not tie_proj:
tf_to_pt_map.update({
layer_str + 'proj': proj_l
})
# Relative positioning biases
if config.untie_r:
layer_str = "transformer/r_r_bias"
layer_str_2 = "transformer/r_w_bias"
r_r_list = []
r_w_list = []
for b in model.layers:
r_r_list.append(b.dec_attn.r_r_bias)
r_w_list.append(b.dec_attn.r_w_bias)
else:
r_r_list = [model.r_r_bias]
r_w_list = [model.r_w_bias]
tf_to_pt_map.update({
'transformer/r_r_bias': r_r_list,
'transformer/r_w_bias': r_w_list})
return tf_to_pt_map
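A minimal sketch, not part of the diff, of how the map above can be inspected, assuming the default configuration is acceptable (this builds the full-size model, so it is only an illustration):

```python
from pytorch_pretrained_bert.modeling_transfo_xl import TransfoXLConfig, TransfoXLModel

config = TransfoXLConfig()       # new defaults: d_model=1024, n_layer=18, untie_r=True, ...
model = TransfoXLModel(config)   # wraps a MemTransformerLM as model.transformer

# Keys are TF variable names; values are PyTorch parameters
# (or lists of per-layer parameters for r_r_bias / r_w_bias).
tf_to_pt_map = build_tf_to_pytorch_map(model.transformer, config)
for tf_name in sorted(tf_to_pt_map):
    print(tf_name)
```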
def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
@@ -35,16 +100,6 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
    tf_path = os.path.abspath(tf_checkpoint_path)
    print("Converting Transformer XL checkpoint from {} with config at {}".format(tf_path, config_path))

-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    arrays = []
-    for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
-        array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        arrays.append(array)
    # Initialise PyTorch model
    # Construct model
    if transfo_xl_config_file == "":
@@ -54,34 +109,37 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = TransfoXLModel(config)

-    for name, array in zip(names, arrays):
-        name = name.split('/')
+    # Build TF to PyTorch weights loading map
+    tf_to_pt_map = build_tf_to_pytorch_map(model.transformer, config)
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    tf_weights = {}
+    for name, shape in init_vars:
+        print("Loading TF weight {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(tf_path, name)
+        tf_weights[name] = array
+    for name, pointer in tf_to_pt_map.items():
+        assert name in tf_weights
+        array = tf_weights[name]
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
        # which are not required for using pretrained model
-        if any(n in ["adam_v", "adam_m"] for n in name):
-            print("Skipping {}".format("/".join(name)))
-            continue
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
-                l = re.split(r'_(\d+)', m_name)
-            else:
-                l = [m_name]
-            if l[0] == 'kernel' or l[0] == 'gamma':
-                pointer = getattr(pointer, 'weight')
-            elif l[0] == 'output_bias' or l[0] == 'beta':
-                pointer = getattr(pointer, 'bias')
-            elif l[0] == 'output_weights':
-                pointer = getattr(pointer, 'weight')
-            else:
-                pointer = getattr(pointer, l[0])
-            if len(l) >= 2:
-                num = int(l[1])
-                pointer = pointer[num]
-        if m_name[-11:] == '_embeddings':
-            pointer = getattr(pointer, 'weight')
-        elif m_name == 'kernel':
+        if 'kernel' in name or 'proj_W' in name:
            array = np.transpose(array)
+        if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1:
+            # Here we will split the TF weigths
+            assert len(pointer) == array.shape[0]
+            for i, p_i in enumerate(pointer):
+                arr_i = array[i, ...]
+                try:
+                    assert p_i.shape == arr_i.shape
+                except AssertionError as e:
+                    e.args += (p_i.shape, arr_i.shape)
+                    raise
+                print("Initialize PyTorch weight {} for layer {}".format(name, i))
+                p_i.data = torch.from_numpy(arr_i)
+            continue
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
...@@ -108,17 +166,16 @@ if __name__ == "__main__": ...@@ -108,17 +166,16 @@ if __name__ == "__main__":
type = str, type = str,
required = True, required = True,
help = "Path the TensorFlow checkpoint path.") help = "Path the TensorFlow checkpoint path.")
parser.add_argument("--transfo_xl_config_file",
default = None,
type = str,
required = True,
help = "The config json file corresponding to the pre-trained BERT model. \n"
"This specifies the model architecture.")
parser.add_argument("--pytorch_dump_folder_path", parser.add_argument("--pytorch_dump_folder_path",
default = None, default = None,
type = str, type = str,
required = True, required = True,
help = "Path to the output PyTorch model.") help = "Path to the output PyTorch model.")
parser.add_argument("--transfo_xl_config_file",
default = "",
type = str,
help = "The config json file corresponding to the pre-trained BERT model. \n"
"This specifies the model architecture.")
args = parser.parse_args() args = parser.parse_args()
convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path, convert_transfo_xl_checkpoint_to_pytorch(args.tf_checkpoint_path,
args.transfo_xl_config_file, args.transfo_xl_config_file,
......
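Not part of the diff: with the arguments above, the conversion would be driven roughly as follows; all paths are placeholders, the script name is assumed from the module path used in __main__.py, and an empty config string falls back to the default TransfoXLConfig:

```python
# Command line:
#   python convert_transfo_xl_checkpoint_to_pytorch.py \
#       --tf_checkpoint_path /path/to/transfo_xl_ckpt \
#       --pytorch_dump_folder_path /path/to/pytorch_dump

# Or directly from Python:
from pytorch_pretrained_bert.convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch

convert_transfo_xl_checkpoint_to_pytorch("/path/to/transfo_xl_ckpt",
                                         "",                      # transfo_xl_config_file: use default config
                                         "/path/to/pytorch_dump")
```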
@@ -34,6 +34,7 @@ from torch.nn import CrossEntropyLoss
from torch.nn.parameter import Parameter

from .modeling import BertLayerNorm as LayerNorm
+from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
from .file_utils import cached_path

logger = logging.getLogger(__name__)
@@ -50,25 +51,26 @@ class TransfoXLConfig(object):
    def __init__(self,
                 vocab_size_or_config_json_file=267735,
                 cutoffs=[20000, 40000, 200000],
-                 d_model=410,
-                 d_embed=410,
-                 d_head=41,
-                 d_inner=2100,
-                 div_val=1.0,
+                 d_model=1024,
+                 d_embed=1024,
+                 n_head=16,
+                 d_head=64,
+                 d_inner=4096,
+                 div_val=4,
                 pre_lnorm=False,
-                 n_layer=16,
-                 n_head=10,
-                 tgt_len=150,
+                 n_layer=18,
+                 tgt_len=256,
                 ext_len=0,
-                 mem_len=150,
+                 mem_len=256,
                 same_length=False,
                 attn_type=0,
                 clamp_len=-1,
                 sample_softmax=-1,
                 adaptive=True,
-                 tied=True,
+                 tie_weight=True,
                 dropout=0.1,
                 dropatt=0.0,
+                 untie_r=True,
                 init="normal",
                 init_range=0.01,
                 proj_init_std=0.01,
@@ -95,10 +97,11 @@ class TransfoXLConfig(object):
            clamp_len: use the same pos embeddings after clamp_len
            sample_softmax: number of samples in sampled softmax
            adaptive: use adaptive softmax
-            tied: tie the word embedding and softmax weights
+            tie_weight: tie the word embedding and softmax weights
            dropout: The dropout probabilitiy for all fully connected
                layers in the embeddings, encoder, and pooler.
            dropatt: The dropout ratio for the attention probabilities.
+            untie_r: untie relative position biases
            embd_pdrop: The dropout ratio for the embeddings.
            init: parameter initializer to use
            init_range: parameters initialized by U(-init_range, init_range).
@@ -111,9 +114,10 @@ class TransfoXLConfig(object):
            for key, value in json_config.items():
                self.__dict__[key] = value
        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
+            self.n_token = vocab_size_or_config_json_file
            self.cutoffs = []
            self.cutoffs.extend(cutoffs)
+            self.tie_weight = tie_weight
            self.tie_projs = [False] + [True] * len(self.cutoffs)
            self.d_model = d_model
            self.d_embed = d_embed
@@ -131,9 +135,9 @@ class TransfoXLConfig(object):
            self.clamp_len = clamp_len
            self.sample_softmax = sample_softmax
            self.adaptive = adaptive
-            self.tied = tied
            self.dropout = dropout
            self.dropatt = dropatt
+            self.untie_r = untie_r
            self.init = init
            self.init_range = init_range
            self.proj_init_std = proj_init_std
@@ -142,10 +146,6 @@ class TransfoXLConfig(object):
            raise ValueError("First argument must be either a vocabulary size (int)"
                             "or the path to a pretrained model config file (str)")

-    @property
-    def total_num_embeddings(self):
-        return self.vocab_size + self.n_special + self.n_ctx
    @classmethod
    def from_dict(cls, json_object):
        """Constructs a `TransfoXLConfig` from a Python dictionary of parameters."""
@@ -230,7 +230,7 @@ class PositionwiseFF(nn.Module):

class MultiHeadAttn(nn.Module):
    def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
-                 pre_lnorm=False):
+                 pre_lnorm=False, r_r_bias=None, r_w_bias=None):
        super(MultiHeadAttn, self).__init__()

        self.n_head = n_head
@@ -251,6 +251,13 @@ class MultiHeadAttn(nn.Module):

        self.pre_lnorm = pre_lnorm

+        if r_r_bias is None or r_w_bias is None: # Biases are not shared
+            self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+            self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+        else:
+            self.r_r_bias = r_r_bias
+            self.r_w_bias = r_w_bias
+
    def forward(self, h, attn_mask=None, mems=None):
        ##### multihead attention
        # [hlen x bsz x n_head x d_head]
@@ -304,7 +311,8 @@ class MultiHeadAttn(nn.Module):

class RelMultiHeadAttn(nn.Module):
    def __init__(self, n_head, d_model, d_head, dropout, dropatt=0,
-                 tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False):
+                 tgt_len=None, ext_len=None, mem_len=None, pre_lnorm=False,
+                 r_r_bias=None, r_w_bias=None):
        super(RelMultiHeadAttn, self).__init__()

        self.n_head = n_head
@@ -324,6 +332,13 @@ class RelMultiHeadAttn(nn.Module):

        self.pre_lnorm = pre_lnorm

+        if r_r_bias is None or r_w_bias is None: # Biases are not shared
+            self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+            self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+        else:
+            self.r_r_bias = r_r_bias
+            self.r_w_bias = r_w_bias
+
    def _parallelogram_mask(self, h, w, left=False):
        mask = torch.ones((h, w)).byte()
        m = min(h, w)
@@ -377,7 +392,7 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):

        self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False)

-    def forward(self, w, r, r_w_bias, r_r_bias, attn_mask=None, mems=None):
+    def forward(self, w, r, attn_mask=None, mems=None):
        qlen, rlen, bsz = w.size(0), r.size(0), w.size(1)

        if mems is not None:
@@ -408,10 +423,10 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
            r_head_k = r_head_k.view(rlen, self.n_head, self.d_head)  # qlen x n_head x d_head

        #### compute attention score
-        rw_head_q = w_head_q + r_w_bias  # qlen x bsz x n_head x d_head
+        rw_head_q = w_head_q + self.r_w_bias  # qlen x bsz x n_head x d_head
        AC = torch.einsum('ibnd,jbnd->ijbn', (rw_head_q, w_head_k))  # qlen x klen x bsz x n_head

-        rr_head_q = w_head_q + r_r_bias
+        rr_head_q = w_head_q + self.r_r_bias
        BD = torch.einsum('ibnd,jnd->ijbn', (rr_head_q, r_head_k))  # qlen x klen x bsz x n_head
        BD = self._rel_shift(BD)
@@ -582,9 +597,9 @@ class RelPartialLearnableDecoderLayer(nn.Module):
        self.pos_ff = PositionwiseFF(d_model, d_inner, dropout,
                                     pre_lnorm=kwargs.get('pre_lnorm'))

-    def forward(self, dec_inp, r, r_w_bias, r_r_bias, dec_attn_mask=None, mems=None):
-        output = self.dec_attn(dec_inp, r, r_w_bias, r_r_bias,
+    def forward(self, dec_inp, r, dec_attn_mask=None, mems=None):
+        output = self.dec_attn(dec_inp, r,
                               attn_mask=dec_attn_mask,
                               mems=mems)
        output = self.pos_ff(output)
@@ -659,9 +674,9 @@ class MemTransformerLM(nn.Module):
                 dropout, dropatt, tie_weight=True, d_embed=None,
                 div_val=1, tie_projs=[False], pre_lnorm=False,
                 tgt_len=None, ext_len=None, mem_len=None,
-                 cutoffs=[], adapt_inp=False,
+                 cutoffs=[], adapt_inp=False, untie_r=False,
                 same_length=False, attn_type=0, clamp_len=-1,
-                 sample_softmax=-1):
+                 sample_softmax=-1, **kwargs):
        super(MemTransformerLM, self).__init__()

        self.n_token = n_token
@@ -685,6 +700,10 @@ class MemTransformerLM(nn.Module):

        self.attn_type = attn_type

+        if not untie_r:
+            self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+            self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
+
        self.layers = nn.ModuleList()
        if attn_type == 0: # the default attention
            for i in range(n_layer):
@@ -692,7 +711,9 @@ class MemTransformerLM(nn.Module):
                    RelPartialLearnableDecoderLayer(
                        n_head, d_model, d_head, d_inner, dropout,
                        tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len,
-                        dropatt=dropatt, pre_lnorm=pre_lnorm)
+                        dropatt=dropatt, pre_lnorm=pre_lnorm,
+                        r_w_bias=None if untie_r else self.r_w_bias,
+                        r_r_bias=None if untie_r else self.r_r_bias)
                )
        elif attn_type == 1: # learnable embeddings
            for i in range(n_layer):
@@ -700,14 +721,18 @@ class MemTransformerLM(nn.Module):
                    RelLearnableDecoderLayer(
                        n_head, d_model, d_head, d_inner, dropout,
                        tgt_len=tgt_len, ext_len=ext_len, mem_len=mem_len,
-                        dropatt=dropatt, pre_lnorm=pre_lnorm)
+                        dropatt=dropatt, pre_lnorm=pre_lnorm,
+                        r_w_bias=None if untie_r else self.r_w_bias,
+                        r_r_bias=None if untie_r else self.r_r_bias)
                )
        elif attn_type in [2, 3]: # absolute embeddings
            for i in range(n_layer):
                self.layers.append(
                    DecoderLayer(
                        n_head, d_model, d_head, d_inner, dropout,
-                        dropatt=dropatt, pre_lnorm=pre_lnorm)
+                        dropatt=dropatt, pre_lnorm=pre_lnorm,
+                        r_w_bias=None if untie_r else self.r_w_bias,
+                        r_r_bias=None if untie_r else self.r_r_bias)
                )

        self.sample_softmax = sample_softmax
@@ -738,21 +763,11 @@ class MemTransformerLM(nn.Module):
        self.same_length = same_length
        self.clamp_len = clamp_len

-        self._create_params()
-    def backward_compatible(self):
-        self.sample_softmax = -1
-    def _create_params(self):
        if self.attn_type == 0: # default attention
            self.pos_emb = PositionalEmbedding(self.d_model)
-            self.r_w_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
-            self.r_r_bias = nn.Parameter(torch.Tensor(self.n_head, self.d_head))
        elif self.attn_type == 1: # learnable
            self.r_emb = nn.Parameter(torch.Tensor(
                self.n_layer, self.max_klen, self.n_head, self.d_head))
-            self.r_w_bias = nn.Parameter(torch.Tensor(
-                self.n_layer, self.n_head, self.d_head))
            self.r_bias = nn.Parameter(torch.Tensor(
                self.n_layer, self.max_klen, self.n_head))
        elif self.attn_type == 2: # absolute standard
@@ -761,6 +776,10 @@ class MemTransformerLM(nn.Module):
            self.r_emb = nn.Parameter(torch.Tensor(
                self.n_layer, self.max_klen, self.n_head, self.d_head))

+    def backward_compatible(self):
+        self.sample_softmax = -1
+
    def reset_length(self, tgt_len, ext_len, mem_len):
        self.tgt_len = tgt_len
        self.mem_len = mem_len
@@ -937,13 +956,13 @@ class TransfoXLPreTrainedModel(nn.Module):
        ))
        self.config = config

-    def init_weight(weight):
+    def init_weight(self, weight):
        if self.config.init == 'uniform':
            nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
        elif self.config.init == 'normal':
            nn.init.normal_(weight, 0.0, self.config.init_std)

-    def init_bias(bias):
+    def init_bias(self, bias):
        nn.init.constant_(bias, 0.0)

    def init_weights(self, m):
@@ -1100,89 +1119,11 @@ class TransfoXLPreTrainedModel(nn.Module):
        return model
###################
class TransfoXLLMHead(nn.Module):
""" Language Model Head for the transformer """
def __init__(self, model_embeddings_weights, config):
super(TransfoXLLMHead, self).__init__()
self.n_embd = config.n_embd
self.set_embeddings_weights(model_embeddings_weights)
def set_embeddings_weights(self, model_embeddings_weights):
embed_shape = model_embeddings_weights.shape
self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
self.decoder.weight = model_embeddings_weights # Tied weights
def forward(self, hidden_state):
# Truncated Language modeling logits (we remove the last token)
# h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd)
lm_logits = self.decoder(hidden_state)
return lm_logits
class TransfoXLMultipleChoiceHead(nn.Module):
""" Classifier Head for the transformer """
def __init__(self, config):
super(TransfoXLMultipleChoiceHead, self).__init__()
self.n_embd = config.n_embd
# self.multiple_choice_token = multiple_choice_token
self.dropout = nn.Dropout2d(config.resid_pdrop) # To reproduce the noise_shape parameter of TF implementation
self.linear = nn.Linear(config.n_embd, 1)
nn.init.normal_(self.linear.weight, std = 0.02)
nn.init.normal_(self.linear.bias, 0)
def forward(self, hidden_states, multiple_choice_token_mask):
# Classification logits
# hidden_states = hidden_states.view(-1, self.n_embd)
# multiple_choice_token_mask = multiple_choice_token_mask.view(-1, 1).expand_as(hidden_states)
multiple_choice_h = hidden_states * multiple_choice_token_mask.unsqueeze(-1)
multiple_choice_h = multiple_choice_h.sum(dim=-2)
# flat = x[..., 0].contiguous().view(-1)
# multiple_choice_h = multiple_choice_h[flat == self.multiple_choice_token, :]
# multiple_choice_h = multiple_choice_h.view(-1, x.size(1), self.n_embd, 1)
# # This double transposition is there to replicate the behavior
# # of the noise_shape argument in the tensorflow
# # implementation. For more details, see
# # https://github.com/huggingface/pytorch-openai-transformer-lm/issues/11
# multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2)
# multiple_choice_h = multiple_choice_h.contiguous().view(-1, self.n_embd)
multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1)
return multiple_choice_logits
class TransfoXLModel(TransfoXLPreTrainedModel):
-    """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").
-    The main implementation difference between BERT and the OpenAI is the use, in OpenAI GPT, of a single embedding matrix
-    to store the word, special ([SEP], [CLS]...) and position embeddings.
+    """ Transformer XL model
+    From "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context"
+    by Zihang Dai*, Zhilin Yang*, Yiming Yang, William W. Cohen, Jaime Carbonell,
+    Quoc V. Le, Ruslan Salakhutdinov (*: equal contribution)
The embeddings are ordered as follow in the word embeddings matrice:
[0, ----------------------
... -> word embeddings
config.vocab_size - 1, ______________________
config.vocab_size,
... -> special embeddings
config.vocab_size + config.n_special - 1, ______________________
config.vocab_size + config.n_special,
... -> position embeddings
total_num_embeddings - 1] ______________________
where total_num_embeddings can be obtained as config.total_num_embeddings and is:
total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx
You should use the associate indices to index the embeddings.
The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
    Params:
        config: a TransfoXLConfig class instance with the configuration to build a new model
@@ -1214,219 +1155,8 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
    """
    def __init__(self, config):
        super(TransfoXLModel, self).__init__(config)
-        total_embeddings_size = config.vocab_size + config.n_special + config.n_ctx
+        self.transformer = MemTransformerLM(**config.to_dict())
self.embed = nn.Embedding(total_embeddings_size, config.n_embd)
self.drop = nn.Dropout(config.embd_pdrop)
block = Block(config.n_ctx, config, scale=True)
self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
        self.apply(self.init_weights)
# nn.init.normal_(self.embed.weight, std=0.02)
def set_num_special_tokens(self, num_special_tokens):
" Update input embeddings with new embedding matrice "
# Update config
self.config.n_special = num_special_tokens
# # Build new embeddings and initialize
old_embed = self.embed
self.embed = nn.Embedding(self.config.total_num_embeddings, self.config.n_embd)
# Initialize all new embeddings (in particular the special tokens)
self.init_weights(self.embed)
# Copy word and positional embeddings from the previous weights
self.embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
self.embed.weight.data[-self.config.n_ctx:, :] = old_embed.weight.data[-self.config.n_ctx:, :]
    def forward(self, input_ids, position_ids=None, token_type_ids=None):
-        if position_ids is None:
+        return self.transformer(input_ids, position_ids, token_type_ids)
start = self.config.vocab_size + self.config.n_special
end = start + input_ids.size(-1)
position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
input_shape = input_ids.size()
input_ids = input_ids.view(-1, input_ids.size(-1))
position_ids = position_ids.view(-1, position_ids.size(-1))
inputs_embeds = self.embed(input_ids)
position_embeds = self.embed(position_ids)
if token_type_ids is not None:
token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
token_type_embeds = self.embed(token_type_ids)
else:
token_type_embeds = 0
# Add the position information to the input embeddings
# h = e.sum(dim=2)
hidden_states = inputs_embeds + position_embeds + token_type_embeds
for block in self.h:
hidden_states = block(hidden_states)
return hidden_states.view(*input_shape, hidden_states.size(-1))
class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
"""OpenAI GPT model with a Language Modeling head ("Improving Language Understanding by Generative Pre-Training").
There are two main implementation differences between BERT and the OpenAI GPT:
- the use of an LM loss in OpenAI GPT which means the Transformer is trained to predict the NEXT token for each input token
vs. predict the SAME token for BERT (i.e. you need to shift your labels to the right)
- the use, in OpenAI GPT, of a single embedding matrix to store the word, special ([SEP], [CLS]...) and position embeddings.
The embeddings are ordered as follow in the word embeddings matrice:
[0, ----------------------
... -> word embeddings
config.vocab_size - 1, ______________________
config.vocab_size,
... -> special embeddings
config.vocab_size + config.n_special - 1, ______________________
config.vocab_size + config.n_special,
... -> position embeddings
total_num_embeddings - 1] ______________________
where total_num_embeddings can be obtained as config.total_num_embeddings and is:
total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx
You should use these indices to index the word, special and position embeddings.
The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
Params:
config: a TransfoXLConfig class instance with the configuration to build a new model
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
with the position indices (selected in the range [config.vocab_size + config.n_special, config.vocab_size + config.n_special + config.n_ctx - 1[.
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
You can use it to add a third embedding (the previous two being the word and position embeddings)
to each token in the sentence.
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., vocab_size]
Outputs:
if `lm_labels` is not `None`:
Outputs the language modeling loss.
else:
`lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_num_embeddings]
(or more generally [d_1, ..., d_n, total_num_embeddings] were d_1 ... d_n are the dimension of input_ids)
Example usage:
```python
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
config = modeling_transfo_xl.TransfoXLConfig()
model = modeling_transfo_xl.TransfoXLLMHeadModel(config)
lm_logits = model(input_ids)
```
"""
def __init__(self, config):
super(TransfoXLLMHeadModel, self).__init__(config)
self.transformer = TransfoXLModel(config)
self.lm_head = TransfoXLLMHead(self.transformer.embed.weight, config)
self.apply(self.init_weights)
def set_num_special_tokens(self, num_special_tokens):
" Update input and output embeddings with new embedding matrice "
self.transformer.set_num_special_tokens(num_special_tokens)
self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None):
hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
lm_logits = self.lm_head(hidden_states)
if lm_labels is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
return loss
return lm_logits
class TransfoXLDoubleHeadsModel(TransfoXLPreTrainedModel):
"""OpenAI GPT model with a Language Modeling and a Multiple Choice heads ("Improving Language Understanding by Generative Pre-Training").
There are two main implementation differences between BERT and the OpenAI GPT:
- the use of an LM loss in OpenAI GPT which means the Transformer is trained to predict the NEXT token for each input token
vs. predict the SAME token for BERT (i.e. you need to shift your labels to the right)
- the use, in OpenAI GPT, of a single embedding matrix to store the word, special ([SEP], [CLS]...) and position embeddings.
The embeddings are ordered as follow in the word embeddings matrice:
[0, ----------------------
... -> word embeddings
config.vocab_size - 1, ______________________
config.vocab_size,
... -> special embeddings
config.vocab_size + config.n_special - 1, ______________________
config.vocab_size + config.n_special,
... -> position embeddings
total_num_embeddings - 1] ______________________
where total_num_embeddings can be obtained as config.total_num_embeddings and is:
total_num_embeddings = config.vocab_size + config.n_special + config.n_ctx
You should use these indices to index the word, special and position embeddings.
The special embeddings ([SEP], [CLS]...) are not pre-trained and need to be trained during the fine-tuning if you use them.
The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
Params:
config: a TransfoXLConfig class instance with the configuration to build a new model
Inputs:
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with the word BPE token indices selected in the range [0, config.vocab_size[
`multiple_choice_token_mask`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with a value of 1 were the last hidden state is (usually the [CLS] token) and 0 otherwise.
`position_ids`: an optional torch.LongTensor with the same shape as input_ids
with the position indices (selected in the range [config.vocab_size + config.n_special,
config.vocab_size + config.n_special + config.n_ctx - 1[.
`token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
You can use it to add a third embedding (the previous two being the word and position embeddings)
to each token in the sentence.
`lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with indices selected in [-1, 0, ..., total_num_embeddings]. All labels set to -1 are ignored (masked), the loss
is only computed for the labels set in [0, ..., total_num_embeddings]
`multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
with indices selected in [0, ..., num_choices].
Outputs:
if `lm_labels` and `multiple_choice_labels` are not `None`:
Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
else: a tuple with
`lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_num_embeddings]
`multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
Example usage:
```python
# Already been converted into BPE token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
multiple_choice_token_mask = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = modeling_transfo_xl.TransfoXLConfig()
model = modeling_transfo_xl.TransfoXLLMHeadModel(config)
lm_logits, multiple_choice_logits = model(input_ids, multiple_choice_token_mask)
```
"""
def __init__(self, config):
super(TransfoXLDoubleHeadsModel, self).__init__(config)
self.transformer = TransfoXLModel(config)
self.lm_head = TransfoXLLMHead(self.transformer.embed.weight, config)
self.multiple_choice_head = TransfoXLMultipleChoiceHead(config)
self.apply(self.init_weights)
def set_num_special_tokens(self, num_special_tokens):
" Update input and output embeddings with new embedding matrice "
self.transformer.set_num_special_tokens(num_special_tokens)
self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
def forward(self, input_ids, multiple_choice_token_mask, position_ids=None, token_type_ids=None,
lm_labels=None, multiple_choice_labels=None):
hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
lm_logits = self.lm_head(hidden_states)
multiple_choice_logits = self.multiple_choice_head(hidden_states, multiple_choice_token_mask)
losses = []
if lm_labels is not None:
loss_fct = CrossEntropyLoss(ignore_index=-1)
losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
if multiple_choice_labels is not None:
loss_fct = CrossEntropyLoss()
losses.append(loss_fct(multiple_choice_logits, multiple_choice_labels.view(-1)))
if losses:
return losses
return lm_logits, multiple_choice_logits