fix transposition in model conversion and memory initialization

b9c77b98 · thomwolf · 009101de · b9c77b98 · b9c77b98
Commit b9c77b98 authored Jan 17, 2019 by thomwolf
2 changed files
--- a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -93,8 +93,6 @@ def build_tf_to_pytorch_map(model, config):

    # Relative positioning biases
    if config.untie_r:
-        layer_str = "transformer/r_r_bias"
-        layer_str_2 = "transformer/r_w_bias"
        r_r_list = []
        r_w_list = []
        for b in model.layers:
@@ -158,7 +156,7 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
            array = tf_weights[name]
            # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
            # which are not required for using pretrained model
-            if 'kernel' in name or 'proj_W' in name:
+            if 'kernel' in name or 'proj' in name:
                array = np.transpose(array)
            if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1:
                # Here we will split the TF weights

--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
@@ -447,10 +447,10 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
        if attn_mask is not None and attn_mask.any().item():
            if attn_mask.dim() == 2:
                attn_score = attn_score.float().masked_fill(
-                    attn_mask[None,:,:,None], -float('inf')).type_as(attn_score)
+                    attn_mask[None,:,:,None], -1e30).type_as(attn_score)
            elif attn_mask.dim() == 3:
                attn_score = attn_score.float().masked_fill(
-                    attn_mask[:,:,:,None], -float('inf')).type_as(attn_score)
+                    attn_mask[:,:,:,None], -1e30).type_as(attn_score)

        # [qlen x klen x bsz x n_head]
        attn_prob = F.softmax(attn_score, dim=1)
@@ -947,12 +947,13 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        self.mem_len = mem_len
        self.ext_len = ext_len

-    def init_mems(self):
+    def init_mems(self, data):
        if self.mem_len > 0:
            mems = []
            param = next(self.parameters())
            for i in range(self.n_layer+1):
-                empty = torch.empty(0, dtype=param.dtype, device=param.device)
+                empty = torch.zeros(self.mem_len, data.size(1), self.config.d_model,
+                                    dtype=param.dtype, device=param.device)
                mems.append(empty)

            return mems
@@ -1081,7 +1082,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
        # So, have to initialize size(0) mems inside the model forward.
        # Moreover, have to return new_mems to allow nn.DataParallel to piece
        # them together.
-        if not mems: mems = self.init_mems()
+        if not mems: mems = self.init_mems(data)

        hidden, new_mems = self._forward(data, mems=mems)
        if target is None: