Commit 60c984da authored by VictorSanh

fix bugs

parent 42968138
pytorch_transformers/__init__.py

@@ -40,7 +40,8 @@ from .modeling_xlm import (XLMConfig, XLMPreTrainedModel, XLMModel,
                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_roberta import (RobertaConfig, RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification,
                                ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_dilbert import (DilBertconfig, DilBertForMaskedLM, DilBertModel, DilBertForSequenceClassification,
+from .modeling_dilbert import (DilBertConfig, DilBertForMaskedLM, DilBertModel,
+                               DilBertForSequenceClassification, DilBertForQuestionAnswering,
                                DILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
                              PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
...
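For reference, a minimal sketch of how the corrected exports are meant to be consumed once this branch of pytorch_transformers is installed. The class names come from the import above; everything else (a default-constructible config, top-level availability) is an assumption.

    # Sketch only: assumes DilBertConfig has usable defaults and that the classes
    # are re-exported at the package top level, as the fixed __init__.py suggests.
    from pytorch_transformers import (DilBertConfig, DilBertModel, DilBertForMaskedLM,
                                      DilBertForSequenceClassification,
                                      DilBertForQuestionAnswering)

    config = DilBertConfig()        # configuration only, no pretrained weights
    model = DilBertModel(config)    # randomly initialized encoder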
pytorch_transformers/modeling_dilbert.py

@@ -45,7 +45,7 @@ DILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 }

-class DilBertconfig(PretrainedConfig):
+class DilBertConfig(PretrainedConfig):
     pretrained_config_archive_map = DILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP

     def __init__(self,
@@ -62,7 +62,7 @@ class DilBertconfig(PretrainedConfig):
                  initializer_range=0.02,
                  tie_weights=True,
                  **kwargs):
-        super(DilBertconfig, self).__init__(**kwargs)
+        super(DilBertConfig, self).__init__(**kwargs)
         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                 and isinstance(vocab_size_or_config_json_file, unicode)):
@@ -77,6 +77,7 @@ class DilBertconfig(PretrainedConfig):
             self.n_layers = n_layers
             self.n_heads = n_heads
             self.dim = dim
+            self.hidden_dim = hidden_dim
             self.dropout = dropout
             self.attention_dropout = attention_dropout
             self.activation = activation
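The added line is the substantive fix in this hunk: `hidden_dim` was accepted by the constructor but never stored on the instance, so any later read of `config.hidden_dim` (for example when sizing the feed-forward layers) would raise an AttributeError. A minimal sketch of what now works, assuming the remaining constructor arguments have defaults; the concrete values are illustrative only.

    from pytorch_transformers import DilBertConfig

    # Argument names are taken from the diff above; the values are hypothetical.
    config = DilBertConfig(n_layers=6, n_heads=12, dim=768,
                           hidden_dim=4 * 768,      # persisted on the config after this fix
                           dropout=0.1, attention_dropout=0.1)
    assert config.hidden_dim == 3072                # raised AttributeError before this commit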
@@ -341,7 +342,7 @@ class DilBertPreTrainedModel(PreTrainedModel):
     """ An abstract class to handle weights initialization and
         a simple interface for downloading and loading pretrained models.
     """
-    config_class = DilBertconfig
+    config_class = DilBertConfig
     pretrained_model_archive_map = DILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = None
     base_model_prefix = "dilbert"
@@ -370,7 +371,7 @@ DILBERT_START_DOCSTRING = r"""
     For more information on DilBERT, you should check TODO(Victor): Link to Medium

     Parameters:
-        config (:class:`~pytorch_transformers.DilBertconfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~pytorch_transformers.DilBertConfig`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
             Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
"""
@@ -391,18 +392,7 @@ DILBERT_INPUTS_DOCSTRING = r"""
 @add_start_docstrings("The bare DilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
                       DILBERT_START_DOCSTRING, DILBERT_INPUTS_DOCSTRING)
 class DilBertModel(DilBertPreTrainedModel):
-    def __init__(self, config):
-        super(DilBertModel, self).__init__(config)
-
-        self.embeddings = Embeddings(config)   # Embeddings
-        self.transformer = Transformer(config) # Encoder
-
-        self.apply(self.init_weights)
-
-    def forward(self,
-                input_ids: torch.tensor,
-                attention_mask: torch.tensor = None):
-        """
+    r"""
     Parameters
     ----------
     input_ids: torch.tensor(bs, seq_length)
@@ -422,7 +412,18 @@ class DilBertModel(DilBertPreTrainedModel):
     all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)]
         Tuple of length n_layers with the attention weights from each layer
         Optional: only if output_attentions=True
     """
+    def __init__(self, config):
+        super(DilBertModel, self).__init__(config)
+
+        self.embeddings = Embeddings(config)   # Embeddings
+        self.transformer = Transformer(config) # Encoder
+
+        self.apply(self.init_weights)
+
+    def forward(self,
+                input_ids: torch.tensor,
+                attention_mask: torch.tensor = None):
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids) # (bs, seq_length)
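As the context lines show, `forward` builds an all-ones attention mask when none is passed, so omitting the mask means "no padding". A minimal usage sketch under the assumptions noted in the comments, with dummy shapes matching the docstring:

    import torch
    from pytorch_transformers import DilBertConfig, DilBertModel

    # Sketch only: assumes DilBertConfig() has defaults and exposes vocab_size.
    config = DilBertConfig()
    model = DilBertModel(config)

    input_ids = torch.randint(0, config.vocab_size, (2, 16))   # (bs, seq_length), dummy ids
    outputs = model(input_ids)                                  # mask defaults to all ones
    hidden_states = outputs[0]                                  # (bs, seq_length, dim)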
@@ -438,6 +439,30 @@ class DilBertModel(DilBertPreTrainedModel):
 @add_start_docstrings("""DilBert Model with a `masked language modeling` head on top. """,
                       DILBERT_START_DOCSTRING, DILBERT_INPUTS_DOCSTRING)
 class DilBertForMaskedLM(DilBertPreTrainedModel):
+    r"""
+    Parameters
+    ----------
+    input_ids: torch.tensor(bs, seq_length)
+        Token ids.
+    attention_mask: torch.tensor(bs, seq_length)
+        Attention mask. Optional: If None, it's like there was no padding.
+    masked_lm_labels: torch.tensor(bs, seq_length)
+        The masked language modeling labels. Optional: If None, no loss is computed.
+
+    Outputs
+    -------
+    mlm_loss: torch.tensor(1,)
+        Masked Language Modeling loss to optimize.
+        Optional: only if `masked_lm_labels` is not None
+    prediction_logits: torch.tensor(bs, seq_length, voc_size)
+        Token prediction logits
+    all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)]
+        Tuple of length n_layers with the hidden states from each layer.
+        Optional: only if `output_hidden_states`=True
+    all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)]
+        Tuple of length n_layers with the attention weights from each layer
+        Optional: only if `output_attentions`=True
+    """
     def __init__(self, config):
         super(DilBertForMaskedLM, self).__init__(config)
         self.output_attentions = config.output_attentions
@@ -449,7 +474,7 @@ class DilBertForMaskedLM(DilBertPreTrainedModel):
         self.vocab_projector = nn.Linear(config.dim, config.vocab_size)

         self.apply(self.init_weights)
-        self.tie_weights()
+        self.tie_weights_()

         self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
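The rename to `tie_weights_` is just a call-site fix, but the underlying idea is worth spelling out: the config's `tie_weights` flag suggests the call presumably shares the MLM head's `vocab_projector` weight matrix with the input embeddings, so the vocab_size x dim parameters exist only once. A generic PyTorch illustration of that tying pattern (not this repository's implementation; sizes are illustrative):

    import torch.nn as nn

    dim, vocab_size = 768, 30522                    # illustrative sizes
    embeddings = nn.Embedding(vocab_size, dim)      # input lookup table, weight: (vocab_size, dim)
    vocab_projector = nn.Linear(dim, vocab_size)    # output head, weight: (vocab_size, dim)

    # Tie them: both modules now point at the same Parameter and train together.
    vocab_projector.weight = embeddings.weight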
@@ -464,30 +489,6 @@ class DilBertForMaskedLM(DilBertPreTrainedModel):
                 input_ids: torch.tensor,
                 attention_mask: torch.tensor = None,
                 masked_lm_labels: torch.tensor = None):
-        """
-        Parameters
-        ----------
-        input_ids: torch.tensor(bs, seq_length)
-            Token ids.
-        attention_mask: torch.tensor(bs, seq_length)
-            Attention mask. Optional: If None, it's like there was no padding.
-        masked_lm_labels: torch.tensor(bs, seq_length)
-            The masked language modeling labels. Optional: If None, no loss is computed.
-
-        Outputs
-        -------
-        mlm_loss: torch.tensor(1,)
-            Masked Language Modeling loss to optimize.
-            Optional: only if `masked_lm_labels` is not None
-        prediction_logits: torch.tensor(bs, seq_length, voc_size)
-            Token prediction logits
-        all_hidden_states: Tuple[torch.tensor(bs, seq_length, dim)]
-            Tuple of length n_layers with the hidden states from each layer.
-            Optional: only if `output_hidden_states`=True
-        all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)]
-            Tuple of length n_layers with the attention weights from each layer
-            Optional: only if `output_attentions`=True
-        """
         tfmr_output = self.encoder(input_ids=input_ids,
                                    attention_mask=attention_mask)
         hidden_states = tfmr_output[0]               # (bs, seq_length, dim)
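With the docstring now at class level, the forward pass reads cleanly: encode, project each position back onto the vocabulary, and compute the loss on labelled positions only (CrossEntropyLoss with ignore_index=-1, per the __init__ hunk above). A minimal sketch, assuming the output tuple is ordered as the class docstring lists it:

    import torch
    from pytorch_transformers import DilBertConfig, DilBertForMaskedLM

    config = DilBertConfig()
    model = DilBertForMaskedLM(config)

    input_ids = torch.randint(0, config.vocab_size, (2, 16))
    masked_lm_labels = input_ids.clone()
    masked_lm_labels[:, :-4] = -1       # ignore_index=-1: only the last 4 positions count

    mlm_loss, prediction_logits = model(input_ids, masked_lm_labels=masked_lm_labels)[:2]
    # prediction_logits: (bs, seq_length, voc_size)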
@@ -508,22 +509,7 @@ class DilBertForMaskedLM(DilBertPreTrainedModel):
                          the pooled output) e.g. for GLUE tasks. """,
                       DILBERT_START_DOCSTRING, DILBERT_INPUTS_DOCSTRING)
 class DilBertForSequenceClassification(DilBertPreTrainedModel):
-    def __init__(self, config):
-        super(DilBertForSequenceClassification, self).__init__(config)
-        self.num_labels = config.num_labels
-
-        self.dilbert = DilBertModel(config)
-        self.pre_classifier = nn.Linear(config.dim, config.dim)
-        self.classifier = nn.Linear(config.dim, config.num_labels)
-        self.dropout = nn.Dropout(config.seq_classif_dropout)
-
-        self.apply(self.init_weights)
-
-    def forward(self,
-                input_ids: torch.tensor,
-                attention_mask: torch.tensor = None,
-                labels: torch.tensor = None):
-        """
+    r"""
     Parameters
     ----------
     input_ids: torch.tensor(bs, seq_length)
@@ -546,7 +532,22 @@ class DilBertForSequenceClassification(DilBertPreTrainedModel):
     all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)]
         Tuple of length n_layers with the attention weights from each layer
         Optional: only if `output_attentions`=True
     """
+    def __init__(self, config):
+        super(DilBertForSequenceClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.dilbert = DilBertModel(config)
+        self.pre_classifier = nn.Linear(config.dim, config.dim)
+        self.classifier = nn.Linear(config.dim, config.num_labels)
+        self.dropout = nn.Dropout(config.seq_classif_dropout)
+
+        self.apply(self.init_weights)
+
+    def forward(self,
+                input_ids: torch.tensor,
+                attention_mask: torch.tensor = None,
+                labels: torch.tensor = None):
         dilbert_output = self.dilbert(input_ids=input_ids,
                                       attention_mask=attention_mask)
         pooled_output = dilbert_output[1]            # (bs, dim)
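The classification head defined above combines the pooled output (`dilbert_output[1]`) with the `pre_classifier`/`classifier` linear layers to produce one logit per label. A usage sketch; the `num_labels` kwarg and the output ordering (loss first when `labels` is given) are assumptions based on the surrounding docstring conventions:

    import torch
    from pytorch_transformers import DilBertConfig, DilBertForSequenceClassification

    config = DilBertConfig(num_labels=2)            # assumes PretrainedConfig accepts num_labels
    model = DilBertForSequenceClassification(config)

    input_ids = torch.randint(0, config.vocab_size, (2, 16))
    labels = torch.tensor([0, 1])

    outputs = model(input_ids, labels=labels)
    loss, logits = outputs[:2]                      # logits: (bs, num_labels)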
@@ -571,22 +572,7 @@ class DilBertForSequenceClassification(DilBertPreTrainedModel):
                          the hidden-states output to compute `span start logits` and `span end logits`). """,
                       DILBERT_START_DOCSTRING, DILBERT_INPUTS_DOCSTRING)
 class DilBertForQuestionAnswering(DilBertPreTrainedModel):
-    def __init__(self, config):
-        super(DilBertForQuestionAnswering, self).__init__(config)
-
-        self.dilbert = DilBertModel(config)
-        self.qa_outputs = nn.Linear(config.dim, config.num_labels)
-        assert config.num_labels == 2
-        self.dropout = nn.Dropout(config.qa_dropout)
-
-        self.apply(self.init_weights)
-
-    def forward(self,
-                input_ids: torch.tensor,
-                attention_mask: torch.tensor = None,
-                start_positions: torch.tensor = None,
-                end_positions: torch.tensor = None):
-        """
+    r"""
     Parameters
     ----------
     input_ids: torch.tensor(bs, seq_length)
@@ -619,7 +605,22 @@ class DilBertForQuestionAnswering(DilBertPreTrainedModel):
     all_attentions: Tuple[torch.tensor(bs, n_heads, seq_length, seq_length)]
         Tuple of length n_layers with the attention weights from each layer
         Optional: only if `output_attentions`=True
     """
+    def __init__(self, config):
+        super(DilBertForQuestionAnswering, self).__init__(config)
+
+        self.dilbert = DilBertModel(config)
+        self.qa_outputs = nn.Linear(config.dim, config.num_labels)
+        assert config.num_labels == 2
+        self.dropout = nn.Dropout(config.qa_dropout)
+
+        self.apply(self.init_weights)
+
+    def forward(self,
+                input_ids: torch.tensor,
+                attention_mask: torch.tensor = None,
+                start_positions: torch.tensor = None,
+                end_positions: torch.tensor = None):
         dilbert_output = self.dilbert(input_ids=input_ids,
                                       attention_mask=attention_mask)
         hidden_states = dilbert_output[0]            # (bs, max_query_len, dim)
...
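Finally, the question-answering head computes span start and end logits (hence the `assert config.num_labels == 2` above). A usage sketch; the output ordering (total span loss first when positions are given) follows the docstring convention of the other heads and is an assumption here:

    import torch
    from pytorch_transformers import DilBertConfig, DilBertForQuestionAnswering

    config = DilBertConfig(num_labels=2)            # required by the assert in __init__
    model = DilBertForQuestionAnswering(config)

    input_ids = torch.randint(0, config.vocab_size, (1, 32))
    start_positions = torch.tensor([3])             # gold span: tokens 3..7
    end_positions = torch.tensor([7])

    outputs = model(input_ids,
                    start_positions=start_positions,
                    end_positions=end_positions)
    loss = outputs[0]                               # span loss to optimize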