Commit 5288913b authored by LysandreJik

All TODOs to be checked by Thom have been added.

parent f773faa2
......@@ -483,7 +483,14 @@ class GPT2Model(GPT2PreTrainedModel):
self.apply(self.init_weights)
def set_num_special_tokens(self, num_special_tokens=None):
"""Update input embeddings with new embedding matrix if needed."""
"""
Update input embeddings with new embedding matrix if needed.
Args:
num_special_tokens: Number of special tokens to be added to the embedding matrix
TODO Lysandre filled args
"""
if num_special_tokens is None or self.config.n_special == num_special_tokens:
return
# Update config
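For context on what this resize involves, a minimal sketch (not the library's actual implementation) of growing an embedding matrix by ``num_special_tokens`` rows while preserving the pretrained weights::

    import torch.nn as nn

    def resize_token_embeddings(old_emb, num_special_tokens):
        # Grow the embedding by num_special_tokens rows, keeping the old weights.
        old_vocab, dim = old_emb.weight.shape
        new_emb = nn.Embedding(old_vocab + num_special_tokens, dim)
        new_emb.weight.data.normal_(mean=0.0, std=0.02)           # initialize the new rows
        new_emb.weight.data[:old_vocab, :] = old_emb.weight.data  # copy the existing rows
        return new_emb

    emb = nn.Embedding(50257, 768)  # GPT-2 base vocabulary size / hidden size
    emb = resize_token_embeddings(emb, num_special_tokens=2)
    print(emb.weight.shape)         # torch.Size([50259, 768])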
......@@ -625,8 +632,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Example::
config = modeling_gpt2.GPT2Config()
......@@ -642,7 +647,13 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
"""
Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
TODO Shouldn't we put args + returns ?
Args:
num_special_tokens: Number of special tokens to be added to the embedding matrix
predict_special_tokens: If set to True, the model will try to predict the specified ``num_special_tokens``.
Defaults to True.
TODO Lysandre filled args
"""
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
self.transformer.set_num_special_tokens(num_special_tokens)
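Since this method also keeps the output embeddings in sync, a minimal sketch of the weight tying it relies on (module names and sizes are illustrative, not the model's real attributes)::

    import torch.nn as nn

    wte = nn.Embedding(50259, 768)               # input token embeddings
    lm_head = nn.Linear(768, 50259, bias=False)  # output projection
    lm_head.weight = wte.weight                  # share one parameter tensor between both modules
    assert lm_head.weight.data_ptr() == wte.weight.data_ptr()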
......@@ -737,7 +748,13 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
"""
Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
TODO Shouldn't we put args + returns ?
Args:
num_special_tokens: Number of special tokens to be added to the embedding matrix
predict_special_tokens: If set to True, the model will try to predict the specified ``num_special_tokens``.
Defaults to True.
TODO Lysandre filled args
"""
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
self.transformer.set_num_special_tokens(num_special_tokens)
......
......@@ -496,12 +496,10 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
"""
Update input embeddings with new embedding matrix if needed
TODO
Args:
num_special_tokens:
num_special_tokens: Number of special tokens to be added to the embedding matrix
Returns:
TODO Lysandre filled Args
"""
if num_special_tokens is None or self.config.n_special == num_special_tokens:
......@@ -665,7 +663,13 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
"""
Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings
TODO
Args:
num_special_tokens: Number of special tokens to be added to the embedding matrix
predict_special_tokens: If set to True, the model will try to predict the specified ``num_special_tokens``.
Defaults to True.
TODO Lysandre filled Args
"""
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
......@@ -775,9 +779,14 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
self.apply(self.init_weights)
def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
""" Update input and output embeddings with new embedding matrice
Make sure we are sharing the embeddings
TODO
""" Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
Args:
num_special_tokens: Number of special tokens to be added to the embedding matrix
predict_special_tokens: If set to True, the model will try to predict the specified ``num_special_tokens``.
Defaults to True.
TODO Lysandre filled Args
"""
self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
self.transformer.set_num_special_tokens(num_special_tokens)
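A hedged end-to-end sketch of how this is typically combined with tokenizer-side special tokens, in the style of the repository's example scripts (the import path, special-token strings, and checkpoint name are assumptions)::

    from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTDoubleHeadsModel

    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt', special_tokens=special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')

    # Grow the shared input/output embeddings; predict_special_tokens defaults to True,
    # so the LM head will also score the new tokens.
    model.set_num_special_tokens(len(special_tokens))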
......
......@@ -623,7 +623,7 @@ class XLNetPreTrainedModel(PreTrainedModel):
class XLNetModel(XLNetPreTrainedModel):
"""XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding").
TODO: this was copied from the XLNetLMHeadModel, check that it's ok.
TODO Lysandre filled: this was copied from the XLNetLMHeadModel, check that it's ok.
Args:
`config`: a XLNetConfig class instance with the configuration to build a new model
......@@ -631,7 +631,15 @@ class XLNetModel(XLNetPreTrainedModel):
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
TODO: Add usage
Example::
config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
n_layer=12, num_attention_heads=12, intermediate_size=3072)
model = modeling.XLNetModel(config=config)
TODO Lysandre filled: Added example usage
"""
def __init__(self, config):
super(XLNetModel, self).__init__(config)
......@@ -663,8 +671,8 @@ class XLNetModel(XLNetPreTrainedModel):
Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked.
Args:
qlen: TODO
mlen: TODO
qlen: TODO Lysandre didn't fill
mlen: TODO Lysandre didn't fill
::
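A minimal sketch of such a causal mask, ignoring XLNet's ``same_length`` option (the helper name is illustrative)::

    import torch

    def causal_attention_mask(qlen, mlen):
        # Float mask of shape [qlen, qlen + mlen]: 1.0 = masked, 0.0 = visible.
        # Each query position sees all mlen memory tokens and no future tokens.
        future = torch.triu(torch.ones(qlen, qlen), diagonal=1)
        memory = torch.zeros(qlen, mlen)
        return torch.cat([memory, future], dim=1)

    print(causal_attention_mask(qlen=3, mlen=2))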
......@@ -783,19 +791,25 @@ class XLNetModel(XLNetPreTrainedModel):
1 for tokens with losses and 0 for tokens without losses.
Only used during pretraining for two-stream attention.
Set to None during finetuning.
head_mask: TODO Lysandre didn't fill
Returns:
TODO Lysandre didn't fill: Missing returns!
Example::
# Already been converted into WordPiece token ids
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
# or
all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
TODO Lysandre filled: Filled with the LMHead example; this is probably different since this model has a different output
mem_len: int, the number of tokens to cache.
reuse_len: int, the number of tokens in the current batch to be cached
and reused in the future.
bi_data: bool, whether to use bidirectional input pipeline.
Usually set to True during pretraining and False during finetuning.
clamp_len: int, clamp all relative distances larger than clamp_len.
-1 means no clamping.
same_length: bool, whether to use the same attention length for each token.
summary_type: str, "last", "first", "mean", or "attn". The method
to pool the input to get a vector representation.
TODO: Add usage
"""
# the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
# but we want a unified interface in the library with the batch size on the first dimension
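To make the pretraining-only arguments above concrete, here is a hedged sketch of building a permutation mask and target mapping that ask the model to predict only the last position (the tensor shapes are assumptions, following the batch-first convention noted in the code comment above, and are not taken from this commit)::

    import torch

    input_ids = torch.LongTensor([[31, 51, 99, 12]])  # [batch, seq_len]
    seq_len = input_ids.shape[1]

    # perm_mask[b, i, j] = 1.0 means token i may not attend to token j.
    perm_mask = torch.zeros(1, seq_len, seq_len)
    perm_mask[:, :, -1] = 1.0                          # hide the last token from every position

    # target_mapping[b, k, j] = 1.0 marks token j as the k-th prediction target.
    target_mapping = torch.zeros(1, 1, seq_len)
    target_mapping[0, 0, -1] = 1.0                     # predict only the last token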
......@@ -951,14 +965,14 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
`keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
This can be used to compute head importance metrics. Default: False
Example::
config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
n_layer=12, num_attention_heads=12, intermediate_size=3072)
model = modeling.XLNetModel(config=config)
model = modeling.XLNetLMHeadModel(config=config)
TODO Lysandre modified: Changed XLNetModel to XLNetLMHeadModel in the example
"""
def __init__(self, config):
super(XLNetLMHeadModel, self).__init__(config)
......@@ -1122,7 +1136,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
1 for tokens with losses and 0 for tokens without losses.
Only used during pre-training for two-stream attention.
Set to None during fine-tuning.
labels: TODO
labels: TODO Lysandre didn't fill
head_mask: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
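A minimal sketch of constructing such a ``head_mask`` under the convention stated above (layer and head counts are illustrative)::

    import torch

    n_layer, n_head = 12, 12
    head_mask = torch.zeros(n_layer, n_head)  # 0.0 = keep the head
    head_mask[0, 3] = 1.0                     # nullify head 3 of layer 0
    head_mask[5, :] = 1.0                     # nullify every head in layer 5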
......