# Token-type embeddings are not defined for the BertExtAbs decoder, so the
# converted model's token-type table is filled with zeros.
# The zeros are shaped after the destination parameter itself. Using another
# embedding table (`bertextabs.decoder.embeddings.weight`) as the template
# would, through the unchecked `.data` assignment, silently give this
# parameter that table's shape/dtype/device instead of its own.
# NOTE(review): presumably the source table has one row per vocabulary token
# while this one has one row per token type -- confirm against the configs.
token_type_weight = model.decoder.bert.embeddings.token_type_embeddings.weight
token_type_weight.data = torch.zeros_like(token_type_weight)
# In the original code the LayerNorms are applied twice in the layers, at the beginning and between the