"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "e3ef62bce150e9200b70d46d3abbc094364330eb"
Commit 5288913b authored by LysandreJik

All TODOs to be checked by Thom have been added.

parent f773faa2
@@ -483,7 +483,14 @@ class GPT2Model(GPT2PreTrainedModel):
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens=None):
-        """Update input embeddings with new embedding matrix if needed."""
+        """
+        Update input embeddings with new embedding matrix if needed.
+
+        Args:
+            num_special_tokens: Special tokens to be added to the embedding matrix
+
+        TODO Lysandre filled args
+        """
         if num_special_tokens is None or self.config.n_special == num_special_tokens:
             return
         # Update config
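A minimal sketch of the guard documented above, assuming the modeling_gpt2 import path used in the ``Example::`` blocks of this diff and default ``GPT2Config()`` values (both assumptions, not guaranteed by the commit)::

    from pytorch_transformers import modeling_gpt2  # package name is an assumption for this branch

    config = modeling_gpt2.GPT2Config()
    model = modeling_gpt2.GPT2Model(config)

    model.set_num_special_tokens(None)                    # guard above: returns immediately
    model.set_num_special_tokens(model.config.n_special)  # embedding matrix already this size: no-op
    model.set_num_special_tokens(3)                       # actually resizes the input embeddings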
@@ -625,8 +632,6 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
             This can be used to compute head importance metrics. Default: False
 
     Example::
 
         config = modeling_gpt2.GPT2Config()
@@ -642,7 +647,13 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
     def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
         """
         Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
-        TODO Shouldn't we put args + returns ?
+
+        Args:
+            num_special_tokens: Special tokens to be added to the embedding matrix
+            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
+                Defaults to True.
+
+        TODO Lysandre filled args
         """
         self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
         self.transformer.set_num_special_tokens(num_special_tokens)
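The same method on the head model also re-ties the output embeddings; a hedged usage sketch, again assuming the modeling_gpt2 import path from the ``Example::`` block above::

    from pytorch_transformers import modeling_gpt2  # package name is an assumption

    config = modeling_gpt2.GPT2Config()
    model = modeling_gpt2.GPT2LMHeadModel(config)

    # Grow the shared input/output embeddings by 3 special-token rows; per the
    # docstring, predict_special_tokens=False would stop the LM head from trying
    # to predict those new tokens.
    model.set_num_special_tokens(3, predict_special_tokens=True)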
@@ -737,7 +748,13 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
         """
         Update input and output embeddings with new embedding matrix.Make sure we are sharing the embeddings
-        TODO Shouldn't we put args + returns ?
+
+        Args:
+            num_special_tokens: Special tokens to be added to the embedding matrix
+            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
+                Defaults to True.
+
+        TODO Lysandre filled args
         """
         self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
         self.transformer.set_num_special_tokens(num_special_tokens)
......
@@ -496,12 +496,10 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         """
         Update input embeddings with new embedding matrice if needed
-        TODO
 
         Args:
-            num_special_tokens:
-        Returns:
+            num_special_tokens: Special tokens to be added to the embedding matrix
+
+        TODO Lysandre filled Args
         """
         if num_special_tokens is None or self.config.n_special == num_special_tokens:
@@ -665,7 +663,13 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
         """
         Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings
-        TODO
+
+        Args:
+            num_special_tokens: Special tokens to be added to the embedding matrix
+            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
+                Defaults to True.
+
+        TODO Lysandre filled Args
         """
         self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
@@ -775,9 +779,14 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
         self.apply(self.init_weights)
 
     def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """ Update input and output embeddings with new embedding matrice
-            Make sure we are sharing the embeddings
-            TODO
+        """ Update input and output embeddings with new embedding matrix. Make sure we are sharing the embeddings.
+
+        Args:
+            num_special_tokens: Special tokens to be added to the embedding matrix
+            predict_special_tokens: if set to True, the model will try and predict the specified ``num_special_tokens``.
+                Defaults to True.
+
+        TODO Lysandre filled Args
         """
         self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
         self.transformer.set_num_special_tokens(num_special_tokens)
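The OpenAI GPT heads expose the same behaviour; a sketch of the predict_special_tokens flag being propagated to both configs, assuming the modeling_openai module can be imported the same way as the GPT-2 one (an assumption)::

    from pytorch_transformers import modeling_openai  # package/module names are assumptions

    config = modeling_openai.OpenAIGPTConfig()
    model = modeling_openai.OpenAIGPTDoubleHeadsModel(config)

    # Resize the shared embeddings on model.transformer and record the flag on
    # both configs, as the assignment below the docstring does.
    model.set_num_special_tokens(5, predict_special_tokens=False)
    assert model.config.predict_special_tokens is False
    assert model.transformer.config.predict_special_tokens is False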
......
@@ -623,7 +623,7 @@ class XLNetPreTrainedModel(PreTrainedModel):
 class XLNetModel(XLNetPreTrainedModel):
     """XLNet model ("XLNet: Generalized Autoregressive Pretraining for Language Understanding").
 
-    TODO: this was copied from the XLNetLMHeadModel, check that it's ok.
+    TODO Lysandre filled: this was copied from the XLNetLMHeadModel, check that it's ok.
 
     Args:
         `config`: a XLNetConfig class instance with the configuration to build a new model
@@ -631,7 +631,15 @@ class XLNetModel(XLNetPreTrainedModel):
         `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
             This can be used to compute head importance metrics. Default: False
 
-    TODO: Add usage
+    Example::
+
+        config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
+            n_layer=12, num_attention_heads=12, intermediate_size=3072)
+
+        model = modeling.XLNetModel(config=config)
+
+    TODO Lysandre filled: Added example usage
     """
 
     def __init__(self, config):
         super(XLNetModel, self).__init__(config)
@@ -663,8 +671,8 @@ class XLNetModel(XLNetPreTrainedModel):
         Creates causal attention mask. Float mask where 1.0 indicates masked, 0.0 indicates not-masked.
 
         Args:
-            qlen: TODO
-            mlen: TODO
+            qlen: TODO Lysandre didn't fill
+            mlen: TODO Lysandre didn't fill
 
         ::
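Since qlen and mlen are still TODO here, a rough illustration of what a causal mask over qlen query tokens with mlen cached memory tokens looks like under the "1.0 indicates masked" convention stated above; this is only a sketch of the idea, not the method implemented in this file (which also handles options such as same-length attention)::

    import torch

    def causal_attention_mask(qlen, mlen):
        # Queries may always attend to the mlen cached tokens.
        mem_mask = torch.zeros(qlen, mlen)
        # Within the current qlen tokens, position i may not attend to positions > i.
        future_mask = torch.triu(torch.ones(qlen, qlen), diagonal=1)
        return torch.cat([mem_mask, future_mask], dim=1)  # 1.0 = masked, 0.0 = not masked

    print(causal_attention_mask(qlen=3, mlen=2))
    # tensor([[0., 0., 0., 1., 1.],
    #         [0., 0., 0., 0., 1.],
    #         [0., 0., 0., 0., 0.]])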
@@ -783,19 +791,25 @@ class XLNetModel(XLNetPreTrainedModel):
                 1 for tokens with losses and 0 for tokens without losses.
                 Only used during pretraining for two-stream attention.
                 Set to None during finetuning.
-            mem_len: int, the number of tokens to cache.
-            reuse_len: int, the number of tokens in the currect batch to be cached
-                and reused in the future.
-            bi_data: bool, whether to use bidirectional input pipeline.
-                Usually set to True during pretraining and False during finetuning.
-            clamp_len: int, clamp all relative distances larger than clamp_len.
-                -1 means no clamping.
-            same_length: bool, whether to use the same attention length for each token.
-            summary_type: str, "last", "first", "mean", or "attn". The method
-                to pool the input to get a vector representation.
+            head_mask: TODO Lysandre didn't fill
 
-        TODO: Add usage
+        Returns:
+            TODO Lysandre didn't fill: Missing returns!
+
+        Example::
+
+            # Already been converted into WordPiece token ids
+            input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
+            input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
+            token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
+
+            all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
+            # or
+            all_encoder_layers, pooled_output = model.forward(input_ids, token_type_ids, input_mask)
+
+        TODO Lysandre filled: Filled with the LMHead example, is probably different since it has a different output
         """
         # the original code for XLNet uses shapes [len, bsz] with the batch dimension at the end
         # but we want a unified interface in the library with the batch size on the first dimension
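head_mask is left as TODO in this forward docstring; the shape and convention documented for XLNetForSequenceClassification later in this diff ([num_heads] or [num_layers, num_heads], 1.0 => head is fully masked) suggest a tensor like the following, assuming the same convention applies here::

    import torch

    n_layer, n_head = 12, 12  # matches the XLNetConfig example above

    # 1.0 => head is fully masked, 0.0 => head is not masked (convention quoted
    # from the sequence-classification docstring further down in this diff).
    head_mask = torch.zeros(n_layer, n_head)
    head_mask[:, 0] = 1.0  # silence the first attention head in every layer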
@@ -951,14 +965,14 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
             This can be used to compute head importance metrics. Default: False
 
     Example::
 
         config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
             n_layer=12, num_attention_heads=12, intermediate_size=3072)
 
-        model = modeling.XLNetModel(config=config)
+        model = modeling.XLNetLMHeadModel(config=config)
+
+    TODO Lysandre modified: Changed XLNetModel to XLNetLMHeadModel in the example
     """
 
     def __init__(self, config):
         super(XLNetLMHeadModel, self).__init__(config)
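With the example now pointing at XLNetLMHeadModel, an end-to-end sketch can be pieced together from the diff's own snippets; the forward call and its return structure are still flagged as unverified in this commit, so everything below is an assumption::

    import torch
    from pytorch_transformers import modeling_xlnet as modeling  # package/module names are assumptions

    config = modeling.XLNetConfig(vocab_size_or_config_json_file=32000, d_model=768,
                                  n_layer=12, num_attention_heads=12, intermediate_size=3072)
    model = modeling.XLNetLMHeadModel(config=config)

    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])  # toy ids from the XLNetModel example
    outputs = model(input_ids)  # what this returns is exactly the "Missing returns!" TODO above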
@@ -1122,7 +1136,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
                 1 for tokens with losses and 0 for tokens without losses.
                 Only used during pre-training for two-stream attention.
                 Set to None during fine-tuning.
-            labels: TODO
+            labels: TODO Lysandre didn't fill
             head_mask: an optional ``torch.Tensor`` of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
                 It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
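The labels argument is still TODO; by analogy with the other sequence-classification heads in the library (an assumption, not something this diff states), it is presumably one class index per sequence::

    import torch

    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])   # toy ids reused from the earlier example
    labels = torch.LongTensor([1, 0])                          # assumed shape: [batch_size] class indices

    # If the head returns [batch_size, num_labels] logits (an assumption), the
    # usual cross-entropy pairing would be:
    logits = torch.randn(2, 2)
    loss = torch.nn.functional.cross_entropy(logits, labels)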
......