Commit a60ae1a5 authored by LysandreJik

Docstrings best practice shown in the BERT documentation.

parent 64fd9863
@@ -150,27 +150,11 @@ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
class BertConfig(PretrainedConfig):
"""Configuration class to store the configuration of a `BertModel`.
"""
pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(self,
vocab_size_or_config_json_file=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
**kwargs):
"""Constructs BertConfig.
r"""
:class:`~pytorch_pretrained_bert.BertConfig` is the configuration class to store the configuration of a
`BertModel`.
Args:
Arguments:
vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
hidden_size: Size of the encoder layers and the pooler layer.
num_hidden_layers: Number of hidden layers in the Transformer encoder.
@@ -193,6 +177,24 @@ class BertConfig(PretrainedConfig):
initializing all weight matrices.
layer_norm_eps: The epsilon used by LayerNorm.
"""
pretrained_config_archive_map = PRETRAINED_CONFIG_ARCHIVE_MAP
def __init__(self,
vocab_size_or_config_json_file=30522,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
layer_norm_eps=1e-12,
**kwargs):
"""Constructs BertConfig.
"""
super(BertConfig, self).__init__(**kwargs)
if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
and isinstance(vocab_size_or_config_json_file, unicode)):
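As a small illustration of the arguments documented in the hunk above, here is a minimal sketch of building a `BertConfig` by hand and reading a few attributes back. The import path follows the `pytorch_pretrained_bert` name used in these docstrings and may differ in your installed release.

```python
# Minimal sketch; the import path is assumed from the docstrings in this file.
from pytorch_pretrained_bert import BertConfig

config = BertConfig(vocab_size_or_config_json_file=32000,
                    hidden_size=768,
                    num_hidden_layers=12,
                    num_attention_heads=12,
                    intermediate_size=3072)

# The keyword arguments above become plain attributes on the config object.
print(config.hidden_size)          # 768
print(config.num_hidden_layers)    # 12
```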
@@ -707,12 +709,32 @@ class BertForPreTraining(BertPreTrainedModel):
- the masked language modeling head, and
- the next sentence classification head.
Params:
Args:
`config`: a BertConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
Inputs:
Example ::
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForPreTraining(config)
"""
def __init__(self, config):
super(BertForPreTraining, self).__init__(config)
self.bert = BertModel(config)
self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
self.apply(self.init_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
next_sentence_label=None, head_mask=None):
"""
Performs a model forward pass. Once the model has been instantiated, it can also be called directly instead of through this method.
Args:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
@@ -732,17 +754,20 @@ class BertForPreTraining(BertPreTrainedModel):
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `masked_lm_labels` and `next_sentence_label` are not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
Returns:
Either a torch.Tensor or tuple(torch.Tensor, torch.Tensor).
if ``masked_lm_labels`` and ``next_sentence_label`` are not ``None``, outputs the total_loss which is the \
sum of the masked language modeling loss and the next \
sentence classification loss.
if `masked_lm_labels` or `next_sentence_label` is `None`:
Outputs a tuple comprising
if ``masked_lm_labels`` or ``next_sentence_label`` is ``None``, outputs a tuple comprising:
- the masked language modeling logits of shape [batch_size, sequence_length, vocab_size], and
- the next sentence classification logits of shape [batch_size, 2].
Example usage:
```python
Example ::
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
@@ -753,18 +778,9 @@ class BertForPreTraining(BertPreTrainedModel):
model = BertForPreTraining(config)
masked_lm_logits_scores, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
```
# or
masked_lm_logits_scores, seq_relationship_logits = model.forward(input_ids, token_type_ids, input_mask)
"""
def __init__(self, config):
super(BertForPreTraining, self).__init__(config)
self.bert = BertModel(config)
self.cls = BertPreTrainingHeads(config, self.bert.embeddings.word_embeddings.weight)
self.apply(self.init_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None,
next_sentence_label=None, head_mask=None):
outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
sequence_output, pooled_output = outputs[:2]
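To make the two call modes described in the `Returns` section above concrete, here is a hedged sketch: calling the model without labels yields the two sets of logits, while supplying both `masked_lm_labels` and `next_sentence_label` yields the combined pre-training loss as documented. The import path and the toy label values are assumptions for illustration only.

```python
import torch
# Assumed import path; the docstrings in this file reference pytorch_pretrained_bert.
from pytorch_pretrained_bert import BertConfig, BertForPreTraining

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12,
                    intermediate_size=3072)
model = BertForPreTraining(config)

# Toy inputs already converted to WordPiece ids, as in the docstring example.
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])

# Without labels: per the docstring, the model outputs the two logit tensors.
masked_lm_logits, seq_relationship_logits = model(input_ids, token_type_ids, input_mask)

# With both label tensors: per the docstring, the model outputs the summed loss.
masked_lm_labels = torch.LongTensor([[31, 51, 99], [15, 5, 0]])   # toy targets
next_sentence_label = torch.LongTensor([0, 1])
total_loss = model(input_ids, token_type_ids, input_mask,
                   masked_lm_labels=masked_lm_labels,
                   next_sentence_label=next_sentence_label)
```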
@@ -786,12 +802,31 @@ class BertForMaskedLM(BertPreTrainedModel):
"""BERT model with the masked language modeling head.
This module comprises the BERT model followed by the masked language modeling head.
Params:
Args:
`config`: a BertConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
Inputs:
Example::
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForMaskedLM(config)
"""
def __init__(self, config):
super(BertForMaskedLM, self).__init__(config)
self.bert = BertModel(config)
self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
self.apply(self.init_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
"""
Performs a model forward pass. Once the model has been instantiated, it can also be called directly instead of through this method.
Args:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
@@ -812,35 +847,21 @@ class BertForMaskedLM(BertPreTrainedModel):
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `masked_lm_labels` is not `None`:
Outputs the masked language modeling loss.
if `masked_lm_labels` is `None`:
Outputs the masked language modeling logits of shape [batch_size, sequence_length, vocab_size].
Returns:
Masked language modeling loss if `masked_lm_labels` is specified, masked language modeling
logits of shape [batch_size, sequence_length, vocab_size] otherwise.
Example::
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForMaskedLM(config)
masked_lm_logits_scores = model(input_ids, token_type_ids, input_mask)
```
# or
masked_lm_logits_scores = model.forward(input_ids, token_type_ids, input_mask)
"""
def __init__(self, config):
super(BertForMaskedLM, self).__init__(config)
self.bert = BertModel(config)
self.cls = BertOnlyMLMHead(config, self.bert.embeddings.word_embeddings.weight)
self.apply(self.init_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, head_mask=None):
outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
sequence_output = outputs[0]
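The same pattern applies to `BertForMaskedLM`: per the `Returns` section above, the call yields the MLM logits without `masked_lm_labels` and the MLM loss with them. A hedged sketch, with the import path assumed:

```python
import torch
from pytorch_pretrained_bert import BertConfig, BertForMaskedLM  # assumed import path

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12,
                    intermediate_size=3072)
model = BertForMaskedLM(config)

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])

# Per the docstring: logits of shape [batch_size, sequence_length, vocab_size].
masked_lm_logits = model(input_ids)

# Per the docstring: the masked language modeling loss when labels are given.
masked_lm_loss = model(input_ids, masked_lm_labels=input_ids)  # toy targets
```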
@@ -859,14 +880,33 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
"""BERT model with next sentence prediction head.
This module comprises the BERT model followed by the next sentence classification head.
Params:
Args:
`config`: a BertConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
Inputs:
Example::
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForNextSentencePrediction(config)
"""
def __init__(self, config):
super(BertForNextSentencePrediction, self).__init__(config)
self.bert = BertModel(config)
self.cls = BertOnlyNSPHead(config)
self.apply(self.init_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None):
"""
Performs a model forward pass. Once the model has been instantiated, it can also be called directly instead of through this method.
Args:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
with the word token indices in the vocabulary(see the tokens pre-processing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
@@ -878,39 +918,27 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
`next_sentence_label`: next sentence classification loss: torch.LongTensor of shape [batch_size]
with indices selected in [0, 1].
0 => next sentence is the continuation, 1 => next sentence is a random sentence.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between
0 and 1. It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked,
0.0 => head is not masked.
Outputs:
if `next_sentence_label` is not `None`:
Outputs the total_loss which is the sum of the masked language modeling loss and the next
sentence classification loss.
if `next_sentence_label` is `None`:
Outputs the next sentence classification logits of shape [batch_size, 2].
Returns:
If `next_sentence_label` is specified, outputs the next sentence classification loss.
if `next_sentence_label` is `None`, outputs the next sentence classification logits of shape [batch_size, 2].
Example::
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForNextSentencePrediction(config)
seq_relationship_logits = model(input_ids, token_type_ids, input_mask)
```
# or
seq_relationship_logits = model.forward(input_ids, token_type_ids, input_mask)
"""
def __init__(self, config):
super(BertForNextSentencePrediction, self).__init__(config)
self.bert = BertModel(config)
self.cls = BertOnlyNSPHead(config)
self.apply(self.init_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, next_sentence_label=None, head_mask=None):
outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
pooled_output = outputs[1]
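For `BertForNextSentencePrediction`, supplying `next_sentence_label` switches the output from the [batch_size, 2] classification logits to the classification loss, per the `Returns` section above. A hedged sketch; the import path and the toy labels are assumptions:

```python
import torch
from pytorch_pretrained_bert import BertConfig, BertForNextSentencePrediction  # assumed path

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12,
                    intermediate_size=3072)
model = BertForNextSentencePrediction(config)

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

seq_relationship_logits = model(input_ids, token_type_ids)   # [batch_size, 2] logits

# 0 => second segment is the true continuation, 1 => random sentence (see docstring).
labels = torch.LongTensor([0, 1])
nsp_loss = model(input_ids, token_type_ids, next_sentence_label=labels)
```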
@@ -936,7 +964,30 @@ class BertForSequenceClassification(BertPreTrainedModel):
`output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
`num_labels`: the number of classes for the classifier. Default = 2.
Inputs:
Example::
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
model = BertForSequenceClassification(config, num_labels)
"""
def __init__(self, config):
super(BertForSequenceClassification, self).__init__(config)
self.num_labels = config.num_labels
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
self.apply(self.init_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
"""
Performs a model forward pass. Once the model has been instantiated, it can also be called directly instead of through this method.
Parameters:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary. Items in the batch should begin with the special "CLS" token. (see the tokens preprocessing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
@@ -952,39 +1003,21 @@ class BertForSequenceClassification(BertPreTrainedModel):
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, num_labels].
Returns:
if `labels` is not `None`, outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`, outputs the classification logits of shape `[batch_size, num_labels]`.
Example::
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
model = BertForSequenceClassification(config, num_labels)
logits = model(input_ids, token_type_ids, input_mask)
```
# or
logits = model.forward(input_ids, token_type_ids, input_mask)
"""
def __init__(self, config):
super(BertForSequenceClassification, self).__init__(config)
self.num_labels = config.num_labels
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
self.apply(self.init_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
pooled_output = outputs[1]
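Note that the constructor shown in this hunk reads `num_labels` from the config rather than taking it as a second argument, so a sketch consistent with the code above sets it on the config first (import path assumed, toy labels for illustration):

```python
import torch
from pytorch_pretrained_bert import BertConfig, BertForSequenceClassification  # assumed path

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12,
                    intermediate_size=3072)
config.num_labels = 2                      # the __init__ above reads config.num_labels
model = BertForSequenceClassification(config)

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])

logits = model(input_ids, attention_mask=input_mask)               # [batch_size, num_labels] per docstring
labels = torch.LongTensor([1, 0])
loss = model(input_ids, attention_mask=input_mask, labels=labels)  # CrossEntropy loss per docstring
```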
@@ -1008,15 +1041,39 @@ class BertForSequenceClassification(BertPreTrainedModel):
class BertForMultipleChoice(BertPreTrainedModel):
"""BERT model for multiple choice tasks.
This module is composed of the BERT model with a linear layer on top of
the pooled output.
This module is composed of the BERT model with a linear layer on top of the pooled output.
Params:
Parameters:
`config`: a BertConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
Inputs:
Example::
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],[[0, 1, 1], [0, 0, 1]]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForMultipleChoice(config)
logits = model(input_ids, token_type_ids, input_mask)
"""
def __init__(self, config):
super(BertForMultipleChoice, self).__init__(config)
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.apply(self.init_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
"""
Performs a model forward pass. Once the model has been instantiated, it can also be called directly instead of through this method.
Parameters:
`input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
@@ -1032,14 +1089,12 @@ class BertForMultipleChoice(BertPreTrainedModel):
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, num_labels].
Returns:
if `labels` is not `None`, outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`, outputs the classification logits of shape [batch_size, num_labels].
Example::
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]], [[12, 16, 42], [14, 28, 57]]])
input_mask = torch.LongTensor([[[1, 1, 1], [1, 1, 0]],[[1,1,0], [1, 0, 0]]])
@@ -1049,18 +1104,7 @@ class BertForMultipleChoice(BertPreTrainedModel):
model = BertForMultipleChoice(config)
logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
super(BertForMultipleChoice, self).__init__(config)
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.apply(self.init_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
""" Input shapes should be [bsz, num choices, seq length] """
num_choices = input_ids.shape[1]
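For `BertForMultipleChoice` the inputs carry an extra `num_choices` dimension, as the forward above notes (`[bsz, num choices, seq length]`). A hedged sketch; the import path and the toy labels are assumptions:

```python
import torch
from pytorch_pretrained_bert import BertConfig, BertForMultipleChoice  # assumed path

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12,
                    intermediate_size=3072)
model = BertForMultipleChoice(config)

# Shape [batch_size, num_choices, sequence_length], as required by forward().
input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]],
                              [[12, 16, 42], [14, 28, 57]]])
token_type_ids = torch.LongTensor([[[0, 0, 1], [0, 1, 0]],
                                   [[0, 1, 1], [0, 0, 1]]])

logits = model(input_ids, token_type_ids)                # classification logits
labels = torch.LongTensor([0, 1])                        # toy index of the correct choice
loss = model(input_ids, token_type_ids, labels=labels)   # CrossEntropy loss per docstring
```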
@@ -1089,15 +1133,38 @@ class BertForTokenClassification(BertPreTrainedModel):
This module is composed of the BERT model with a linear layer on top of
the full hidden state of the last layer.
Params:
Parameters:
`config`: a BertConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
`num_labels`: the number of classes for the classifier. Default = 2.
Inputs:
Example::
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
model = BertForTokenClassification(config, num_labels)
"""
def __init__(self, config):
super(BertForTokenClassification, self).__init__(config)
self.num_labels = config.num_labels
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.apply(self.init_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
"""
Performs a model forward pass. Once the model has been instantiated, it can also be called directly instead of through this method.
Parameters:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
with the word token indices in the vocabulary(see the tokens pre-processing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
`token_type_ids`: an optional torch.LongTensor of shape [batch_size, sequence_length] with the token
types indices selected in [0, 1]. Type 0 corresponds to a `sentence A` and type 1 corresponds to
@@ -1111,39 +1178,21 @@ class BertForTokenClassification(BertPreTrainedModel):
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `labels` is not `None`:
Outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`:
Outputs the classification logits of shape [batch_size, sequence_length, num_labels].
Returns:
if `labels` is not `None`, outputs the CrossEntropy classification loss of the output with the labels.
if `labels` is `None`, outputs the classification logits of shape [batch_size, sequence_length, num_labels].
Example::
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
num_labels = 2
model = BertForTokenClassification(config, num_labels)
logits = model(input_ids, token_type_ids, input_mask)
```
# or
logits = model.forward(input_ids, token_type_ids, input_mask)
"""
def __init__(self, config):
super(BertForTokenClassification, self).__init__(config)
self.num_labels = config.num_labels
self.bert = BertModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, config.num_labels)
self.apply(self.init_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None, head_mask=None):
outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
sequence_output = outputs[0]
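`BertForTokenClassification` follows the same convention but with one label per token, so the label tensor matches the [batch_size, sequence_length] input shape. A hedged sketch; the import path and the toy labels are assumptions:

```python
import torch
from pytorch_pretrained_bert import BertConfig, BertForTokenClassification  # assumed path

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12,
                    intermediate_size=3072)
config.num_labels = 2                      # the __init__ above reads config.num_labels
model = BertForTokenClassification(config)

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])

logits = model(input_ids)                             # [batch_size, sequence_length, num_labels] per docstring
labels = torch.LongTensor([[0, 1, 0], [1, 0, 0]])     # toy label id per token
loss = model(input_ids, labels=labels)                # CrossEntropy loss per docstring
```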
@@ -1171,12 +1220,31 @@ class BertForQuestionAnswering(BertPreTrainedModel):
This module is composed of the BERT model with a linear layer on top of
the sequence output that computes start_logits and end_logits
Params:
Parameters:
`config`: a BertConfig class instance with the configuration to build a new model
`output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
`output_hidden_states`: If True, also output hidden states computed by the model at each layer. Default: False
Inputs:
Example::
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForQuestionAnswering(config)
"""
def __init__(self, config):
super(BertForQuestionAnswering, self).__init__(config)
self.num_labels = config.num_labels
self.bert = BertModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.apply(self.init_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
end_positions=None, head_mask=None):
"""
Parameters:
`input_ids`: a torch.LongTensor of shape [batch_size, sequence_length]
with the word token indices in the vocabulary(see the tokens preprocessing logic in the scripts
`run_bert_extract_features.py`, `run_bert_classifier.py` and `run_bert_squad.py`)
@@ -1196,38 +1264,21 @@ class BertForQuestionAnswering(BertPreTrainedModel):
`head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
Outputs:
if `start_positions` and `end_positions` are not `None`:
Outputs the total_loss which is the sum of the CrossEntropy loss for the start and end token positions.
if `start_positions` or `end_positions` is `None`:
Outputs a tuple of start_logits, end_logits which are the logits respectively for the start and end
position tokens of shape [batch_size, sequence_length].
Returns:
if `start_positions` and `end_positions` are not `None`, outputs the total_loss which is the sum of the
CrossEntropy loss for the start and end token positions.
if `start_positions` or `end_positions` is `None`, outputs a tuple of start_logits, end_logits which are the
logits respectively for the start and end position tokens of shape [batch_size, sequence_length].
Example::
Example usage:
```python
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)
model = BertForQuestionAnswering(config)
start_logits, end_logits = model(input_ids, token_type_ids, input_mask)
```
"""
def __init__(self, config):
super(BertForQuestionAnswering, self).__init__(config)
self.num_labels = config.num_labels
self.bert = BertModel(config)
self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
self.apply(self.init_weights)
def forward(self, input_ids, token_type_ids=None, attention_mask=None, start_positions=None,
end_positions=None, head_mask=None):
outputs = self.bert(input_ids, token_type_ids, attention_mask, head_mask=head_mask)
sequence_output = outputs[0]
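For `BertForQuestionAnswering`, passing `start_positions` and `end_positions` switches the output from the (start_logits, end_logits) pair to the summed CrossEntropy loss, per the `Returns` section above. A hedged sketch; the import path and the toy span positions are assumptions:

```python
import torch
from pytorch_pretrained_bert import BertConfig, BertForQuestionAnswering  # assumed path

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12,
                    intermediate_size=3072)
config.num_labels = 2                      # qa_outputs above is sized from config.num_labels (start/end)
model = BertForQuestionAnswering(config)

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])

# Without span labels: per the docstring, logits for the start and end positions.
start_logits, end_logits = model(input_ids, attention_mask=input_mask)

# With span labels: per the docstring, the summed start/end CrossEntropy loss.
start_positions = torch.LongTensor([1, 0])
end_positions = torch.LongTensor([2, 1])
total_loss = model(input_ids, attention_mask=input_mask,
                   start_positions=start_positions, end_positions=end_positions)
```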
@@ -182,7 +182,8 @@ SCHEDULES = {
class BertAdam(Optimizer):
"""Implements BERT version of Adam algorithm with weight decay fix.
Params:
Parameters:
lr: learning rate
warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1
t_total: total number of training steps for the learning
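The `BertAdam` parameters listed above (`lr`, `warmup` as a fraction of `t_total`, and `t_total`) map directly onto the constructor. A hedged sketch of wiring it to a model's parameters; the import paths follow the package name used in these docstrings and may differ in your installed release:

```python
from pytorch_pretrained_bert import BertConfig, BertForSequenceClassification  # assumed path
from pytorch_pretrained_bert.optimization import BertAdam                      # assumed path

config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
                    num_hidden_layers=12, num_attention_heads=12,
                    intermediate_size=3072)
config.num_labels = 2
model = BertForSequenceClassification(config)

num_train_steps = 1000
optimizer = BertAdam(model.parameters(),
                     lr=5e-5,          # learning rate
                     warmup=0.1,       # warm up over the first 10% of t_total
                     t_total=num_train_steps)

# Inside a training loop one would then call, in the usual PyTorch fashion:
#   loss.backward(); optimizer.step(); optimizer.zero_grad()
```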
@@ -84,24 +84,22 @@ def whitespace_tokenize(text):
class BertTokenizer(object):
"""Runs end-to-end tokenization: punctuation splitting + wordpiece"""
def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
"""Constructs a BertTokenizer.
r"""
Constructs a BertTokenizer.
:class:`~pytorch_pretrained_bert.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
Args:
vocab_file: Path to a one-wordpiece-per-line vocabulary file
do_lower_case: Whether to lower case the input
Only has an effect when do_wordpiece_only=False
do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
do_basic_tokenize: Whether to do basic tokenization before wordpiece.
max_len: An artificial maximum length to truncate tokenized sequences to;
Effective maximum length is always the minimum of this
value (if specified) and the underlying BERT model's
sequence length.
never_split: List of tokens which will never be split during tokenization.
Only has an effect when do_wordpiece_only=False
max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
minimum of this value (if specified) and the underlying BERT model's sequence length.
never_split: List of tokens which will never be split during tokenization. Only has an effect when
do_wordpiece_only=False
"""
def __init__(self, vocab_file, do_lower_case=True, max_len=None, do_basic_tokenize=True,
never_split=("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")):
if not os.path.isfile(vocab_file):
raise ValueError(
"Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "