"ppocr/git@developer.sourcefind.cn:wangsen/paddle_dbnet.git" did not exist on "b6cd60476c7a065841d61a247ab3cd4264622c30"
Commit b9c77b98 authored by thomwolf

fix transposition in model conversion and memory initialization

parent 009101de
@@ -93,8 +93,6 @@ def build_tf_to_pytorch_map(model, config):
     # Relative positioning biases
     if config.untie_r:
-        layer_str = "transformer/r_r_bias"
-        layer_str_2 = "transformer/r_w_bias"
         r_r_list = []
         r_w_list = []
         for b in model.layers:
@@ -158,7 +156,7 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
         array = tf_weights[name]
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
-        if 'kernel' in name or 'proj_W' in name:
+        if 'kernel' in name or 'proj' in name:
             array = np.transpose(array)
         if ('r_r_bias' in name or 'r_w_bias' in name) and len(pointer) > 1:
             # Here we will split the TF weigths
...
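A minimal, self-contained sketch (toy shapes, nothing taken from the real checkpoint) of why the converted kernel/projection weights need np.transpose: TF stores dense kernels as [in_features, out_features], whereas torch.nn.Linear.weight expects [out_features, in_features].

import numpy as np
import torch
import torch.nn.functional as F

# Hypothetical shapes for illustration only
tf_kernel = np.random.randn(16, 32).astype(np.float32)   # [in, out] as saved by TF
pt_weight = torch.from_numpy(np.transpose(tf_kernel))    # [out, in] for nn.Linear

x = torch.randn(4, 16)
y_tf_style = x @ torch.from_numpy(tf_kernel)              # what the TF graph computes
y_pt_style = F.linear(x, pt_weight)                       # x @ pt_weight.T
assert torch.allclose(y_tf_style, y_pt_style, atol=1e-5)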
@@ -447,10 +447,10 @@ class RelPartialLearnableMultiHeadAttn(RelMultiHeadAttn):
         if attn_mask is not None and attn_mask.any().item():
             if attn_mask.dim() == 2:
                 attn_score = attn_score.float().masked_fill(
-                    attn_mask[None,:,:,None], -float('inf')).type_as(attn_score)
+                    attn_mask[None,:,:,None], -1e30).type_as(attn_score)
             elif attn_mask.dim() == 3:
                 attn_score = attn_score.float().masked_fill(
-                    attn_mask[:,:,:,None], -float('inf')).type_as(attn_score)
+                    attn_mask[:,:,:,None], -1e30).type_as(attn_score)

         # [qlen x klen x bsz x n_head]
         attn_prob = F.softmax(attn_score, dim=1)
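A small standalone illustration (assumed shapes, not the model's actual tensors) of the numerical difference between filling masked scores with -float('inf') and with a large finite value such as -1e30: one common failure mode with -inf is that a fully masked row becomes NaN after softmax, while -1e30 keeps the output finite and still drives masked positions to essentially zero probability.

import torch
import torch.nn.functional as F

scores = torch.randn(2, 4)
mask = torch.tensor([[False, True, True, False],
                     [True, True, True, True]])   # second row is fully masked

inf_masked = scores.masked_fill(mask, -float('inf'))
fin_masked = scores.masked_fill(mask, -1e30)

print(F.softmax(inf_masked, dim=-1))  # fully masked row -> nan
print(F.softmax(fin_masked, dim=-1))  # fully masked row -> finite (uniform), no nan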
@@ -947,12 +947,13 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         self.mem_len = mem_len
         self.ext_len = ext_len

-    def init_mems(self):
+    def init_mems(self, data):
         if self.mem_len > 0:
             mems = []
             param = next(self.parameters())
             for i in range(self.n_layer+1):
-                empty = torch.empty(0, dtype=param.dtype, device=param.device)
+                empty = torch.zeros(self.mem_len, data.size(1), self.config.d_model,
+                                    dtype=param.dtype, device=param.device)
                 mems.append(empty)

             return mems
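As a rough sketch of the new initialization (hypothetical sizes; the real values come from the model config and the input batch), each layer's memory now starts as a zero tensor of shape [mem_len, batch_size, d_model] instead of a zero-length tensor, with one entry per layer plus the embedding output:

import torch

# Hypothetical configuration values for illustration
mem_len, bsz, d_model, n_layer = 160, 4, 410, 16

mems = [torch.zeros(mem_len, bsz, d_model) for _ in range(n_layer + 1)]
print(len(mems), mems[0].shape)  # 17 memories, each [mem_len, bsz, d_model]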
@@ -1081,7 +1082,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         # So, have to initialize size(0) mems inside the model forward.
         # Moreover, have to return new_mems to allow nn.DataParallel to piece
         # them together.
-        if not mems: mems = self.init_mems()
+        if not mems: mems = self.init_mems(data)
         hidden, new_mems = self._forward(data, mems=mems)

         if target is None:
...
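The comment above describes a pattern rather than a specific API, so here is a generic, hypothetical module (not the actual TransfoXLModel) showing the idea: build the memories inside forward when none are passed in, and return the updated memories so a wrapper such as nn.DataParallel can gather them from each replica instead of relying on module state.

import torch
import torch.nn as nn

class StatefulBlock(nn.Module):
    # Toy module for illustration; names and sizes are made up
    def __init__(self, d_model=8, mem_len=4):
        super().__init__()
        self.proj = nn.Linear(d_model, d_model)
        self.mem_len = mem_len

    def forward(self, x, mems=None):
        # Initialize the memory inside forward so each replica builds its own
        if mems is None:
            mems = torch.zeros(self.mem_len, x.size(1), x.size(2),
                               dtype=x.dtype, device=x.device)
        hidden = self.proj(torch.cat([mems, x], dim=0))
        new_mems = hidden[-self.mem_len:].detach()
        # Return the new memories rather than storing them on the module
        return hidden, new_mems

x = torch.randn(5, 2, 8)        # [seq_len, bsz, d_model]
block = StatefulBlock()
hidden, mems = block(x)
hidden, mems = block(x, mems)   # feed the memories back on the next segment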