"tests/models/vscode:/vscode.git/clone" did not exist on "a72f1c9f5b907f96cbb7de3bbb02a1d431d34071"
Commit a60ae1a5 authored by LysandreJik

Docstrings best practice shown in the BERT documentation.

parent 64fd9863
@@ -150,27 +150,11 @@ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
 class BertConfig(PretrainedConfig):
-    """Configuration class to store the configuration of a `BertModel`.
-    """
-    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
-
-    def __init__(self,
-                 vocab_size_or_config_json_file=30522,
-                 hidden_size=768,
-                 num_hidden_layers=12,
-                 num_attention_heads=12,
-                 intermediate_size=3072,
-                 hidden_act="gelu",
-                 hidden_dropout_prob=0.1,
-                 attention_probs_dropout_prob=0.1,
-                 max_position_embeddings=512,
-                 type_vocab_size=2,
-                 initializer_range=0.02,
-                 layer_norm_eps=1e-12,
-                 **kwargs):
-        """Constructs BertConfig.
-
-        Args:
+    r"""
+    :class:`~pytorch_pretrained_bert.BertConfig` is the configuration class to store the configuration of a
+    `BertModel`.
+
+    Arguments:
         vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
         hidden_size: Size of the encoder layers and the pooler layer.
         num_hidden_layers: Number of hidden layers in the Transformer encoder.
@@ -193,6 +177,24 @@ class BertConfig(PretrainedConfig):
             initializing all weight matrices.
         layer_norm_eps: The epsilon used by LayerNorm.
     """
+    pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(self,
+                 vocab_size_or_config_json_file=30522,
+                 hidden_size=768,
+                 num_hidden_layers=12,
+                 num_attention_heads=12,
+                 intermediate_size=3072,
+                 hidden_act="gelu",
+                 hidden_dropout_prob=0.1,
+                 attention_probs_dropout_prob=0.1,
+                 max_position_embeddings=512,
+                 type_vocab_size=2,
+                 initializer_range=0.02,
+                 layer_norm_eps=1e-12,
+                 **kwargs):
+        """Constructs BertConfig.
+        """
         super(BertConfig, self).__init__(**kwargs)
         if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
                                                                and isinstance(vocab_size_or_config_json_file, unicode)):
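For reference, a minimal sketch of the two construction paths the `isinstance` check above supports: keyword arguments or a path to a JSON config file. The file name used below is a hypothetical placeholder, and the attribute names follow the argument list documented above.

```python
from pytorch_pretrained_bert import BertConfig

# Build a configuration from keyword arguments (defaults are documented above).
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12,
                    intermediate_size=3072)
print(config.hidden_size, config.num_attention_heads)

# The same first argument also accepts a string: a path to a JSON file whose keys
# mirror the arguments above ("bert_config.json" is a hypothetical path).
# config = BertConfig(vocab_size_or_config_json_file="bert_config.json")
```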
@@ -707,12 +709,32 @@ class BertForPreTraining(BertPreTrainedModel):
         - the masked language modeling head, and
         - the next sentence classification head.

-    Params:
+    Args:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False

-    Inputs:
+    Example::
+
+        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        model = BertForPreTraining(config)
+    """
+
+    def __init__(self, config):
+        super(BertForPreTraining, self).__init__(config)
+        self.bert = BertModel(config)
+        self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
+        self.apply(self.init_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
+                next_sentence_label=None, head_mask=None):
+        """
+        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+
+        Args:
             `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
                 with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
                 `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
@@ -732,17 +754,20 @@ class BertForPreTraining(BertPreTrainedModel):
             `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
                 It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.

-    Outputs:
-        if `masked_lm_labels` and `next_sentence_label` are not `None`:
-            Outputs the total_loss which is the sum of the masked language modeling loss and the next
+        Returns:
+            Either a torch.Tensor or tuple(torch.Tensor, torch.Tensor).
+            if ``masked_lm_labels`` and ``next_sentence_label`` are not ``None``, outputs the total_loss which is the \
+            sum of the masked language modeling loss and the next \
             sentence classification loss.
-        if `masked_lm_labels` or `next_sentence_label` is `None`:
-            Outputs a tuple comprising
+            if ``masked_lm_labels`` or ``next_sentence_label`` is ``None``, outputs a tuple comprising:
                 - the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
                 - the next sentence classification logits of shape [batch_size, 2].

-    Example usage:
-    ```python
+        Example::
+
             # Already been converted into WordPiece token ids
             input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
             input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
@@ -753,18 +778,9 @@ class BertForPreTraining(BertPreTrainedModel):
             model = BertForPreTraining(config)
             masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
-    ```
+            # or
+            masked_lm_logits_scores, seq_relationship_logits = model.forward(input_ids, token_type_ids, input_mask)
     """
-
-    def __init__(self, config):
-        super(BertForPreTraining, self).__init__(config)
-        self.bert = BertModel(config)
-        self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
-        self.apply(self.init_weights)
-
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
-                next_sentence_label=None, head_mask=None):
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
         sequence_output, pooled_output = outputs[:2]
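To make the forward pass above concrete, here is a hedged sketch of one pre-training step with both label tensors supplied; the label values are toy data, and, per the docstring, the call then returns the combined masked-LM and next-sentence loss.

```python
import torch
from pytorch_pretrained_bert import BertConfig, BertForPreTraining

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForPreTraining(config)

# Toy, already-tokenized inputs (same values as the docstring example).
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])

# Toy labels: a vocabulary id per position for the masked-LM head, and
# 0 ("is the continuation") or 1 ("random sentence") for the next-sentence head.
masked_lm_labels = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
next_sentence_label = torch.LongTensor([0, 1])

outputs = model(input_ids, token_type_ids, input_mask,
                masked_lm_labels=masked_lm_labels,
                next_sentence_label=next_sentence_label)
# The docstring above says this is the summed loss; unwrap defensively in case
# the installed version returns a tuple instead.
loss = outputs[0] if isinstance(outputs, tuple) else outputs
loss.backward()
```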
@@ -786,12 +802,31 @@ class BertForMaskedLM(BertPreTrainedModel):
     """BERT model with the masked language modeling head.
     This module comprises the BERT model followed by the masked language modeling head.

-    Params:
+    Args:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False

-    Inputs:
+    Example::
+
+        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        model = BertForMaskedLM(config)
+    """
+
+    def __init__(self, config):
+        super(BertForMaskedLM, self).__init__(config)
+        self.bert = BertModel(config)
+        self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
+        self.apply(self.init_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
+        """
+        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+
+        Args:
             `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
                 with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
                 `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
@@ -812,35 +847,21 @@ class BertForMaskedLM(BertPreTrainedModel):
             `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
                 It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.

-    Outputs:
-        if `masked_lm_labels` is not `None`:
-            Outputs the masked language modeling loss.
-        if `masked_lm_labels` is `None`:
-            Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].
+        Returns:
+            Masked language modeling loss if `masked_lm_labels` is specified, masked language modeling
+            logits of shape [batch_size, sequence_length, vocab_size] otherwise.

-    Example usage:
-    ```python
+        Example::
+
             # Already been converted into WordPiece token ids
             input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
             input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
             token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-                num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-            model = BertForMaskedLM(config)
             masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
-    ```
+            # or
+            masked_lm_logits_scores = model.forward(input_ids, token_type_ids, input_mask)
     """
-
-    def __init__(self, config):
-        super(BertForMaskedLM, self).__init__(config)
-        self.bert = BertModel(config)
-        self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
-        self.apply(self.init_weights)
-
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
         sequence_output = outputs[0]
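As a small follow-up to the docstring above, a hedged sketch of turning the masked-LM logits into predicted token ids with a plain argmax; mapping ids back to wordpieces via the tokenizer is omitted.

```python
import torch
from pytorch_pretrained_bert import BertConfig, BertForMaskedLM

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForMaskedLM(config)
model.eval()

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])

with torch.no_grad():
    outputs = model(input_ids, token_type_ids, input_mask)

# Without masked_lm_labels the docstring above promises logits of shape
# [batch_size, sequence_length, vocab_size]; unwrap defensively if a tuple comes back.
logits = outputs[0] if isinstance(outputs, tuple) else outputs
predicted_ids = logits.argmax(dim=-1)   # one predicted vocabulary id per position
print(predicted_ids.shape)
```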
@@ -859,14 +880,33 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
     """BERT model with next sentence prediction head.
     This module comprises the BERT model followed by the next sentence classification head.

-    Params:
+    Args:
        `config`: a BertConfig class instance with the configuration to build a new model
        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
        `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False

-    Inputs:
+    Example::
+
+        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        model = BertForNextSentencePrediction(config)
+    """
+
+    def __init__(self, config):
+        super(BertForNextSentencePrediction, self).__init__(config)
+        self.bert = BertModel(config)
+        self.cls = BertOnlyNSPHead(config)
+        self.apply(self.init_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None):
+        """
+        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+
+        Args:
             `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+                with the word token indices in the vocabulary(see the tokens pre-processing logic in the scripts
                 `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
             `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
                 types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
@@ -878,39 +918,27 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
             `next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
                 with indices selected in [0, 1].
                 0 => next sentence is the continuation, 1 => next sentence is a random sentence.
-            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-                It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
+            `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between
+                0 and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked,
+                0.0 => head is not masked.

-    Outputs:
-        if `next_sentence_label` is not `None`:
-            Outputs the total_loss which is the sum of the masked language modeling loss and the next
-            sentence classification loss.
-        if `next_sentence_label` is `None`:
-            Outputs the next sentence classification logits of shape [batch_size, 2].
+        Returns:
+            If `next_sentence_label` is specified, outputs the next sentence classification loss.
+            if `next_sentence_label` is `None`, outputs the next sentence classification logits of shape [batch_size, 2].

-    Example usage:
-    ```python
+        Example::
+
             # Already been converted into WordPiece token ids
             input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
             input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
             token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-                num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-            model = BertForNextSentencePrediction(config)
             seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
-    ```
+            # or
+            seq_relationship_logits = model.forward(input_ids, token_type_ids, input_mask)
     """
-
-    def __init__(self, config):
-        super(BertForNextSentencePrediction, self).__init__(config)
-        self.bert = BertModel(config)
-        self.cls = BertOnlyNSPHead(config)
-        self.apply(self.init_weights)
-
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None):
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
         pooled_output = outputs[1]
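For the next-sentence head above, a short hedged sketch of reading the two logits as a probability that sentence B actually follows sentence A.

```python
import torch
from pytorch_pretrained_bert import BertConfig, BertForNextSentencePrediction

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForNextSentencePrediction(config)
model.eval()

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])

with torch.no_grad():
    outputs = model(input_ids, token_type_ids, input_mask)

# Logits of shape [batch_size, 2]; index 0 = "is the continuation", 1 = "random sentence".
seq_relationship_logits = outputs[0] if isinstance(outputs, tuple) else outputs
is_next_prob = torch.softmax(seq_relationship_logits, dim=-1)[:, 0]
print(is_next_prob)
```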
@@ -936,7 +964,30 @@ class BertForSequenceClassification(BertPreTrainedModel):
         `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
         `num_labels`: the number of classes for the classifier. Default = 2.

-    Inputs:
+    Example::
+
+        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+        num_labels = 2
+        model = BertForSequenceClassification(config, num_labels)
+    """
+
+    def __init__(self, config):
+        super(BertForSequenceClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
+        self.apply(self.init_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
+        """
+        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+
+        Parameters:
             `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
                 with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts
                 `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
@@ -952,39 +1003,21 @@ class BertForSequenceClassification(BertPreTrainedModel):
             `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
                 It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.

-    Outputs:
-        if `labels` is not `None`:
-            Outputs the CrossEntropy classification loss of the output with the labels.
-        if `labels` is `None`:
-            Outputs the classification logits of shape [batch_size, num_labels].
+        Returns:
+            if `labels` is not `None`, outputs the CrossEntropy classification loss of the output with the labels.
+            if `labels` is `None`, outputs the classification logits of shape `[batch_size, num_labels]`.

-    Example usage:
-    ```python
+        Example::
+
             # Already been converted into WordPiece token ids
             input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
             input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
             token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-                num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-            num_labels = 2
-            model = BertForSequenceClassification(config, num_labels)
             logits = model(input_ids, token_type_ids, input_mask)
-    ```
+            # or
+            logits = model.forward(input_ids, token_type_ids, input_mask)
     """
-
-    def __init__(self, config):
-        super(BertForSequenceClassification, self).__init__(config)
-        self.num_labels = config.num_labels
-        self.bert = BertModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
-        self.apply(self.init_weights)
-
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
         pooled_output = outputs[1]
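Tying the pieces above together, a hedged sketch of one classification training step. It assumes `num_labels` can be supplied through the configuration's keyword arguments and read back as `config.num_labels` by `__init__` above (an assumption about how the config handles extra keywords); the labels are toy values.

```python
import torch
from pytorch_pretrained_bert import BertConfig, BertForSequenceClassification

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12,
                    intermediate_size=3072, num_labels=2)
model = BertForSequenceClassification(config)

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
labels = torch.LongTensor([1, 0])   # one class id per sequence

outputs = model(input_ids, token_type_ids, input_mask, labels=labels)
# With labels the docstring above promises the CrossEntropy loss; unwrap if needed.
loss = outputs[0] if isinstance(outputs, tuple) else outputs
loss.backward()
```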
@@ -1008,15 +1041,39 @@ class BertForSequenceClassification(BertPreTrainedModel):
 class BertForMultipleChoice(BertPreTrainedModel):
     """BERT model for multiple choice tasks.
-    This module is composed of the BERT model with a linear layer on top of
-    the pooled output.
+    This module is composed of the BERT model with a linear layer on top of the pooled output.

-    Params:
+    Parameters:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False

-    Inputs:
+    Example::
+
+        # Already been converted into WordPiece token ids
+        input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
+        input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
+        token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
+
+        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+        model = BertForMultipleChoice(config)
+        logits = model(input_ids, token_type_ids, input_mask)
+    """
+
+    def __init__(self, config):
+        super(BertForMultipleChoice, self).__init__(config)
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+        self.apply(self.init_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
+        """
+        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+
+        Parameters:
             `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
                 with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
                 `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
@@ -1032,14 +1089,12 @@ class BertForMultipleChoice(BertPreTrainedModel):
             `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
                 It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.

-    Outputs:
-        if `labels` is not `None`:
-            Outputs the CrossEntropy classification loss of the output with the labels.
-        if `labels` is `None`:
-            Outputs the classification logits of shape [batch_size, num_labels].
+        Returns:
+            if `labels` is not `None`, outputs the CrossEntropy classification loss of the output with the labels.
+            if `labels` is `None`, outputs the classification logits of shape [batch_size, num_labels].

-    Example usage:
-    ```python
+        Example::
+
             # Already been converted into WordPiece token ids
             input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
             input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
@@ -1049,18 +1104,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
             model = BertForMultipleChoice(config)
             logits = model(input_ids, token_type_ids, input_mask)
-    ```
     """
-
-    def __init__(self, config):
-        super(BertForMultipleChoice, self).__init__(config)
-        self.bert = BertModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, 1)
-        self.apply(self.init_weights)
-
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
         """ Input shapes should be [bsz, num choices, seq length] """
         num_choices = input_ids.shape[1]
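To illustrate the [batch_size, num_choices, sequence_length] layout used above, a hedged sketch that scores two choices per example and picks the best one with an argmax over the choice dimension.

```python
import torch
from pytorch_pretrained_bert import BertConfig, BertForMultipleChoice

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForMultipleChoice(config)
model.eval()

# Shape [batch_size=2, num_choices=2, sequence_length=3]
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]],
                              [[12, 16, 42], [14, 28, 57]]])
token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],
                                   [[0, 1, 1], [0, 0, 1]]])
input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],
                               [[1, 1, 0], [1, 0, 0]]])

with torch.no_grad():
    outputs = model(input_ids, token_type_ids, input_mask)

logits = outputs[0] if isinstance(outputs, tuple) else outputs  # one score per choice
best_choice = logits.argmax(dim=-1)   # highest-scoring choice index per example
print(best_choice)
```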
@@ -1089,15 +1133,38 @@ class BertForTokenClassification(BertPreTrainedModel):
     This module is composed of the BERT model with a linear layer on top of
     the full hidden state of the last layer.

-    Params:
+    Parameters:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
         `num_labels`: the number of classes for the classifier. Default = 2.

-    Inputs:
+    Example::
+
+        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+
+        num_labels = 2
+        model = BertForTokenClassification(config, num_labels)
+    """
+
+    def __init__(self, config):
+        super(BertForTokenClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+        self.bert = BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        self.apply(self.init_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
+        """
+        Performs a model forward pass. Can be called by calling the class directly, once it has been instantiated.
+
+        Parameters:
             `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
-                with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
+                with the word token indices in the vocabulary(see the tokens pre-processing logic in the scripts
                 `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
             `token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
                 types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
@@ -1111,39 +1178,21 @@ class BertForTokenClassification(BertPreTrainedModel):
             `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
                 It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.

-    Outputs:
-        if `labels` is not `None`:
-            Outputs the CrossEntropy classification loss of the output with the labels.
-        if `labels` is `None`:
-            Outputs the classification logits of shape [batch_size, sequence_length, num_labels].
+        Returns:
+            if `labels` is not `None`, outputs the CrossEntropy classification loss of the output with the labels.
+            if `labels` is `None`, outputs the classification logits of shape [batch_size, sequence_length, num_labels].

-    Example usage:
-    ```python
+        Example::
+
             # Already been converted into WordPiece token ids
             input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
             input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
             token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-                num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-            num_labels = 2
-            model = BertForTokenClassification(config, num_labels)
             logits = model(input_ids, token_type_ids, input_mask)
-    ```
+            # or
+            logits = model.forward(input_ids, token_type_ids, input_mask)
     """
-
-    def __init__(self, config):
-        super(BertForTokenClassification, self).__init__(config)
-        self.num_labels = config.num_labels
-        self.bert = BertModel(config)
-        self.dropout = nn.Dropout(config.hidden_dropout_prob)
-        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
-        self.apply(self.init_weights)
-
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
         sequence_output = outputs[0]
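For the token-level head above, a short hedged sketch that reads one predicted label per token; as in the sequence-classification sketch, `num_labels` is passed through the configuration keyword arguments, which is an assumption.

```python
import torch
from pytorch_pretrained_bert import BertConfig, BertForTokenClassification

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12,
                    intermediate_size=3072, num_labels=2)
model = BertForTokenClassification(config)
model.eval()

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])

with torch.no_grad():
    outputs = model(input_ids, token_type_ids, input_mask)

# Logits of shape [batch_size, sequence_length, num_labels] per the docstring above.
logits = outputs[0] if isinstance(outputs, tuple) else outputs
predicted_tags = logits.argmax(dim=-1)   # one label id per token
print(predicted_tags)
```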
@@ -1171,12 +1220,31 @@ class BertForQuestionAnswering(BertPreTrainedModel):
     This module is composed of the BERT model with a linear layer on top of
     the sequence output that computes start_logits and end_logits

-    Params:
+    Parameters:
         `config`: a BertConfig class instance with the configuration to build a new model
         `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
         `output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False

-    Inputs:
+    Example::
+
+        config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+            num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
+        model = BertForQuestionAnswering(config)
+    """
+
+    def __init__(self, config):
+        super(BertForQuestionAnswering, self).__init__(config)
+        self.num_labels = config.num_labels
+        self.bert = BertModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+        self.apply(self.init_weights)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
+                end_positions=None, head_mask=None):
+        """
+        Parameters:
             `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
                 with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
                 `run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
@@ -1196,38 +1264,21 @@ class BertForQuestionAnswering(BertPreTrainedModel):
             `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
                 It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.

-    Outputs:
-        if `start_positions` and `end_positions` are not `None`:
-            Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
-        if `start_positions` or `end_positions` is `None`:
-            Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
-            position tokens of shape [batch_size, sequence_length].
+        Returns:
+            if `start_positions` and `end_positions` are not `None`, outputs the total_loss which is the sum of the
+            CrossEntropy loss for the start and end token positions.
+            if `start_positions` or `end_positions` is `None`, outputs a tuple of start_logits, end_logits which are the
+            logits respectively for the start and end position tokens of shape [batch_size, sequence_length].

-    Example usage:
-    ```python
+        Example::
+
             # Already been converted into WordPiece token ids
             input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
             input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
             token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
-
-            config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-                num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
-
-            model = BertForQuestionAnswering(config)
             start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
-    ```
     """
-
-    def __init__(self, config):
-        super(BertForQuestionAnswering, self).__init__(config)
-        self.num_labels = config.num_labels
-        self.bert = BertModel(config)
-        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
-        self.apply(self.init_weights)
-
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
-                end_positions=None, head_mask=None):
         outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
         sequence_output = outputs[0]
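A hedged sketch of decoding the start/end logits documented above into an answer span; the independent argmax below is only illustrative (real SQuAD decoding also enforces start <= end and a maximum answer length).

```python
import torch
from pytorch_pretrained_bert import BertConfig, BertForQuestionAnswering

# num_labels=2 gives the two outputs (start and end) the head above expects.
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12,
                    intermediate_size=3072, num_labels=2)
model = BertForQuestionAnswering(config)
model.eval()

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])

with torch.no_grad():
    outputs = model(input_ids, token_type_ids, input_mask)

# Per the docstring above the first two outputs are the start and end logits,
# each of shape [batch_size, sequence_length].
start_logits, end_logits = outputs[0], outputs[1]
start_index = start_logits.argmax(dim=-1)   # predicted answer start per example
end_index = end_logits.argmax(dim=-1)       # predicted answer end per example
print(start_index, end_index)
```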
@@ -182,7 +182,8 @@ SCHEDULES = {
 class BertAdam(Optimizer):
     """Implements BERT version of Adam algorithm with weight decay fix.
-    Params:
+
+    Parameters:
         lr: learning rate
         warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
         t_total: total number of training steps for the learning
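To show how the `lr`, `warmup`, and `t_total` parameters above fit together, a hedged sketch of setting up BertAdam for a model; the hyper-parameter values are illustrative, not recommendations.

```python
from pytorch_pretrained_bert import BertConfig, BertModel
from pytorch_pretrained_bert.optimization import BertAdam

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertModel(config)

num_train_steps = 1000                       # total optimizer steps planned for the run
optimizer = BertAdam(model.parameters(),
                     lr=5e-5,                # learning rate
                     warmup=0.1,             # first 10% of t_total is warmup
                     t_total=num_train_steps)
```

The library's example scripts additionally split the parameters into groups so that biases and LayerNorm weights receive no weight decay; that grouping is a convention of the scripts rather than a requirement of the optimizer.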
@@ -84,24 +84,22 @@ def whitespace_tokenize(text):
 class BertTokenizer(object):
-    """Runs end-to-end tokenization: punctuation splitting + wordpiece"""
-
-    def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,
-                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
-        """Constructs a BertTokenizer.
+    r"""
+    Constructs a BertTokenizer.
+    :class:`~pytorch_pretrained_bert.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece

     Args:
         vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input
-            Only has an effect when do_wordpiece_only=False
+        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
         do_basic_tokenize: Whether to do basic tokenization before wordpiece.
-        max_len: An artificial maximum length to truncate tokenized sequences to;
-            Effective maximum length is always the minimum of this
-            value (if specified) and the underlying BERT model's
-            sequence length.
-        never_split: List of tokens which will never be split during tokenization.
-            Only has an effect when do_wordpiece_only=False
+        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
+            minimum of this value (if specified) and the underlying BERT model's sequence length.
+        never_split: List of tokens which will never be split during tokenization. Only has an effect when
+            do_wordpiece_only=False
     """
+
+    def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,
+                 never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
         if not os.path.isfile(vocab_file):
             raise ValueError(
                 "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
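Finally, a hedged sketch of the tokenizer round trip implied by the constructor above; the vocabulary path is a hypothetical placeholder, and `tokenize`/`convert_tokens_to_ids` are the methods the library's example scripts use for this step.

```python
from pytorch_pretrained_bert import BertTokenizer

# "vocab.txt" is a hypothetical path to a one-wordpiece-per-line vocabulary file.
tokenizer = BertTokenizer("vocab.txt", do_lower_case=True, max_len=512)

tokens = tokenizer.tokenize("Hello, how are you?")
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens)
print(token_ids)
```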