Commit 3cf12b23 authored by thomwolf

added tests + fixed losses

parent eed51c5b
@@ -549,7 +549,7 @@ class BertPreTrainedModel(nn.Module):
                 model.__class__.__name__, unexpected_keys))
         if len(error_msgs) > 0:
             raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
-                self.__class__.__name__, "\n\t".join(error_msgs)))
+                model.__class__.__name__, "\n\t".join(error_msgs)))
         if tempdir:
             # Clean up temp dir
             shutil.rmtree(tempdir)
...
@@ -48,12 +48,10 @@ class OpenAIGPTConfig(object):
                  n_embd=768,
                  n_layer=12,
                  n_head=12,
-                 intermediate_size=3072,
                  afn="gelu",
                  resid_pdrop=0.1,
                  embd_pdrop=0.1,
                  attn_pdrop=0.1,
-                 type_vocab_size=2,
                  initializer_range=0.02):
         """Constructs OpenAIGPTConfig.
@@ -65,8 +63,6 @@ class OpenAIGPTConfig(object):
             n_layer: Number of hidden layers in the Transformer encoder.
             n_head: Number of attention heads for each attention layer in
                 the Transformer encoder.
-            intermediate_size: The size of the "intermediate" (i.e., feed-forward)
-                layer in the Transformer encoder.
             afn: The non-linear activation function (function or string) in the
                 encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
             resid_pdrop: The dropout probabilitiy for all fully connected
@@ -74,8 +70,6 @@ class OpenAIGPTConfig(object):
             attn_pdrop: The dropout ratio for the attention
                 probabilities.
             embd_pdrop: The dropout ratio for the embeddings.
-            type_vocab_size: The vocabulary size of the `token_type_ids` passed into
-                `OpenAIGPTModel`.
             initializer_range: The sttdev of the truncated_normal_initializer for
                 initializing all weight matrices.
         """
@@ -92,11 +86,9 @@ class OpenAIGPTConfig(object):
             self.n_layer = n_layer
             self.n_head = n_head
             self.afn = afn
-            self.intermediate_size = intermediate_size
             self.resid_pdrop = resid_pdrop
             self.embd_pdrop = embd_pdrop
             self.attn_pdrop = attn_pdrop
-            self.type_vocab_size = type_vocab_size
             self.initializer_range = initializer_range
         else:
             raise ValueError("First argument must be either a vocabulary size (int)"
@@ -133,143 +125,6 @@ class OpenAIGPTConfig(object):
         """Serializes this instance to a JSON string."""
         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-class OpenAIGPTPreTrainedModel(nn.Module):
-    """ An abstract class to handle weights initialization and
-        a simple interface for dowloading and loading pretrained models.
-    """
-    def __init__(self, config, *inputs, **kwargs):
-        super(OpenAIGPTPreTrainedModel, self).__init__()
-        if not isinstance(config, OpenAIGPTConfig):
-            raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `OpenAIGPTConfig`. "
-                "To create a model from a Google pretrained model use "
-                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    self.__class__.__name__, self.__class__.__name__
-                ))
-        self.config = config
-    def init_weights(self, module):
-        """ Initialize the weights.
-        """
-        if isinstance(module, (nn.Linear, nn.Embedding)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-        elif isinstance(module, LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-        if isinstance(module, nn.Linear) and module.bias is not None:
-            module.bias.data.zero_()
-    def post_loading(self):
-        pass
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name, state_dict=None, cache_dir=None, *inputs, **kwargs):
-        """
-        Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
-        Download and cache the pre-trained model file if needed.
-        Params:
-            pretrained_model_name: either:
-                - a str with the name of a pre-trained model to load selected in the list of:
-                    . `openai-gpt`
-                - a path or url to a pretrained model archive containing:
-                    . `openai_gpt_config.json` a configuration file for the model
-                    . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
-            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
-            *inputs, **kwargs: additional input for the specific Bert class
-                (ex: num_labels for BertForSequenceClassification)
-        """
-        if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP:
-            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name]
-        else:
-            archive_file = pretrained_model_name
-        # redirect to the cache, if necessary
-        try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
-        except FileNotFoundError:
-            logger.error(
-                "Model name '{}' was not found in model name list ({}). "
-                "We assumed '{}' was a path or url but couldn't find any file "
-                "associated to this path or url.".format(
-                    pretrained_model_name,
-                    ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
-                    archive_file))
-            return None
-        if resolved_archive_file == archive_file:
-            logger.info("loading archive file {}".format(archive_file))
-        else:
-            logger.info("loading archive file {} from cache at {}".format(
-                archive_file, resolved_archive_file))
-        tempdir = None
-        if os.path.isdir(resolved_archive_file):
-            serialization_dir = resolved_archive_file
-        else:
-            # Extract archive to temp dir
-            tempdir = tempfile.mkdtemp()
-            logger.info("extracting archive file {} to temp dir {}".format(
-                resolved_archive_file, tempdir))
-            with tarfile.open(resolved_archive_file, 'r:gz') as archive:
-                archive.extractall(tempdir)
-            serialization_dir = tempdir
-        # Load config
-        config_file = os.path.join(serialization_dir, CONFIG_NAME)
-        config = OpenAIGPTConfig.from_json_file(config_file)
-        logger.info("Model config {}".format(config))
-        # Instantiate model.
-        model = cls(config, *inputs, **kwargs)
-        if state_dict is None:
-            weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
-            state_dict = torch.load(weights_path)
-        old_keys = []
-        new_keys = []
-        for key in state_dict.keys():
-            new_key = None
-            if 'gamma' in key:
-                new_key = key.replace('gamma', 'weight')
-            if 'beta' in key:
-                new_key = key.replace('beta', 'bias')
-            if new_key:
-                old_keys.append(key)
-                new_keys.append(new_key)
-        for old_key, new_key in zip(old_keys, new_keys):
-            state_dict[new_key] = state_dict.pop(old_key)
-        missing_keys = []
-        unexpected_keys = []
-        error_msgs = []
-        # copy state_dict so _load_from_state_dict can modify it
-        metadata = getattr(state_dict, '_metadata', None)
-        state_dict = state_dict.copy()
-        if metadata is not None:
-            state_dict._metadata = metadata
-        def load(module, prefix=''):
-            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-            module._load_from_state_dict(
-                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
-            for name, child in module._modules.items():
-                if child is not None:
-                    load(child, prefix + name + '.')
-        load(model.transformer if hasattr(model, 'transformer') else model, prefix='')
-        if len(missing_keys) > 0:
-            logger.info("Weights of {} not initialized from pretrained model: {}".format(
-                model.__class__.__name__, missing_keys))
-        if len(unexpected_keys) > 0:
-            logger.info("Weights from pretrained model not used in {}: {}".format(
-                model.__class__.__name__, unexpected_keys))
-        if len(error_msgs) > 0:
-            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
-                self.__class__.__name__, "\n\t".join(error_msgs)))
-        model.post_loading()
-        if tempdir:
-            # Clean up temp dir
-            shutil.rmtree(tempdir)
-        return model
 class Conv1D(nn.Module):
     def __init__(self, nf, rf, nx):
         super(Conv1D, self).__init__()
@@ -312,7 +167,11 @@ class Attention(nn.Module):
         w = torch.matmul(q, k)
         if self.scale:
             w = w / math.sqrt(v.size(-1))
-        w = w * self.b + -1e9 * (1 - self.b)  # TF implem method: mask_attn_weights
+        # w = w * self.b + -1e9 * (1 - self.b)  # TF implem method: mask_attn_weights
+        # XD: self.b may be larger than w, so we need to crop it
+        b = self.b[:, :, :w.size(-2), :w.size(-1)]
+        w = w * b + -1e9 * (1 - b)
         w = nn.Softmax(dim=-1)(w)
         w = self.attn_dropout(w)
         return torch.matmul(w, v)
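Note on the hunk above: the registered causal mask `b` covers the full n_ctx window, so it has to be cropped before being applied to shorter inputs. A minimal standalone sketch of the same masking step, with made-up sizes:

    import torch

    n_ctx = 512                                   # window the mask buffer was built for
    b = torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx)  # lower-triangular causal mask
    w = torch.randn(2, 4, 5, 5)                   # attention scores: batch x heads x seq x seq

    # Crop the mask to the actual sequence lengths, then push masked
    # (future) positions to ~-1e9 so softmax zeroes them out.
    b = b[:, :, :w.size(-2), :w.size(-1)]
    w = w * b + -1e9 * (1 - b)
    probs = torch.softmax(w, dim=-1)              # each position attends only to itself and earlier ones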
@@ -388,41 +247,184 @@ class OpenAIGPTLMHead(nn.Module):
         self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
         self.decoder.weight = model_embeddings_weights  # Tied weights
-    def forward(self, h):
+    def forward(self, hidden_state):
         # Truncated Language modeling logits (we remove the last token)
-        h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd)
-        lm_logits = self.decoder(h_trunc)
+        # h_trunc = h[:, :-1].contiguous().view(-1, self.n_embd)
+        lm_logits = self.decoder(hidden_state)
         return lm_logits
-class OpenAIGPTClfHead(nn.Module):
+class OpenAIGPTMultipleChoiceHead(nn.Module):
     """ Classifier Head for the transformer """
-    def __init__(self, clf_token, cfg):
-        super(OpenAIGPTClfHead, self).__init__()
+    def __init__(self, cfg):
+        super(OpenAIGPTMultipleChoiceHead, self).__init__()
         self.n_embd = cfg.n_embd
-        self.clf_token = clf_token
+        # self.multiple_choice_token = multiple_choice_token
         self.dropout = nn.Dropout2d(cfg.resid_pdrop)  # To reproduce the noise_shape parameter of TF implementation
         self.linear = nn.Linear(cfg.n_embd, 1)
         nn.init.normal_(self.linear.weight, std = 0.02)
         nn.init.normal_(self.linear.bias, 0)
-    def forward(self, h, x):
+    def forward(self, hidden_states, classification_token_mask):
         # Classification logits
-        clf_h = h.view(-1, self.n_embd)
-        flat = x[..., 0].contiguous().view(-1)
-        clf_h = clf_h[flat == self.clf_token, :]
-        clf_h = clf_h.view(-1, x.size(1), self.n_embd, 1)
-        # This double transposition is there to replicate the behavior
-        # of the noise_shape argument in the tensorflow
-        # implementation. For more details, see
-        # https://github.com/huggingface/pytorch-openai-transformer-lm/issues/11
-        clf_h = self.dropout(clf_h.transpose(1, 2)).transpose(1, 2)
-        clf_h = clf_h.contiguous().view(-1, self.n_embd)
-        clf_logits = self.linear(clf_h)
-        return clf_logits.view(-1, x.size(1))
+        # hidden_states = hidden_states.view(-1, self.n_embd)
+        # classification_token_mask = classification_token_mask.view(-1, 1).expand_as(hidden_states)
+        multiple_choice_h = hidden_states * classification_token_mask.unsqueeze(-1)
+        multiple_choice_h = multiple_choice_h.sum(dim=-2)
+        # flat = x[..., 0].contiguous().view(-1)
+        # multiple_choice_h = multiple_choice_h[flat == self.multiple_choice_token, :]
+        # multiple_choice_h = multiple_choice_h.view(-1, x.size(1), self.n_embd, 1)
+        # # This double transposition is there to replicate the behavior
+        # # of the noise_shape argument in the tensorflow
+        # # implementation. For more details, see
+        # # https://github.com/huggingface/pytorch-openai-transformer-lm/issues/11
+        # multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2)
+        # multiple_choice_h = multiple_choice_h.contiguous().view(-1, self.n_embd)
+        multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1)
+        return multiple_choice_logits
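The rewritten head no longer selects hidden states by matching token ids against a stored `clf_token`; the caller now passes a mask that is 1 at the classification token of each choice. A small sketch of the select-by-mask step (shapes illustrative):

    import torch

    B, C, S, H = 2, 3, 7, 16                      # batch, choices, sequence, hidden size
    hidden_states = torch.randn(B, C, S, H)
    mask = torch.zeros(B, C, S)
    mask[:, :, -1] = 1.0                          # assume the classification token sits last

    # Zeroing every other position and summing over the sequence axis
    # is equivalent to gathering the classification token's hidden state.
    picked = (hidden_states * mask.unsqueeze(-1)).sum(dim=-2)   # B x C x H
    assert torch.allclose(picked, hidden_states[:, :, -1, :])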
+class OpenAIGPTPreTrainedModel(nn.Module):
+    """ An abstract class to handle weights initialization and
+        a simple interface for dowloading and loading pretrained models.
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(OpenAIGPTPreTrainedModel, self).__init__()
+        if not isinstance(config, OpenAIGPTConfig):
+            raise ValueError(
+                "Parameter config in `{}(config)` should be an instance of class `OpenAIGPTConfig`. "
+                "To create a model from a pretrained model use "
+                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
+                    self.__class__.__name__, self.__class__.__name__
+                ))
+        self.config = config
+    def init_weights(self, module):
+        """ Initialize the weights.
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+    def set_num_special_tokens(self, num_special_tokens):
+        pass
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name, num_special_tokens=0, state_dict=None, cache_dir=None,
+                        *inputs, **kwargs):
+        """
+        Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
+        Download and cache the pre-trained model file if needed.
+        Params:
+            pretrained_model_name: either:
+                - a str with the name of a pre-trained model to load selected in the list of:
+                    . `openai-gpt`
+                - a path or url to a pretrained model archive containing:
+                    . `openai_gpt_config.json` a configuration file for the model
+                    . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
+            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
+            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
+            *inputs, **kwargs: additional input for the specific Bert class
+                (ex: num_labels for BertForSequenceClassification)
+        """
+        if pretrained_model_name in PRETRAINED_MODEL_ARCHIVE_MAP:
+            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name]
+        else:
+            archive_file = pretrained_model_name
+        # redirect to the cache, if necessary
+        try:
+            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
+        except FileNotFoundError:
+            logger.error(
+                "Model name '{}' was not found in model name list ({}). "
+                "We assumed '{}' was a path or url but couldn't find any file "
+                "associated to this path or url.".format(
+                    pretrained_model_name,
+                    ', '.join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()),
+                    archive_file))
+            return None
+        if resolved_archive_file == archive_file:
+            logger.info("loading archive file {}".format(archive_file))
+        else:
+            logger.info("loading archive file {} from cache at {}".format(
+                archive_file, resolved_archive_file))
+        tempdir = None
+        if os.path.isdir(resolved_archive_file):
+            serialization_dir = resolved_archive_file
+        else:
+            # Extract archive to temp dir
+            tempdir = tempfile.mkdtemp()
+            logger.info("extracting archive file {} to temp dir {}".format(
+                resolved_archive_file, tempdir))
+            with tarfile.open(resolved_archive_file, 'r:gz') as archive:
+                archive.extractall(tempdir)
+            serialization_dir = tempdir
+        # Load config
+        config_file = os.path.join(serialization_dir, CONFIG_NAME)
+        config = OpenAIGPTConfig.from_json_file(config_file)
+        logger.info("Model config {}".format(config))
+        # Instantiate model.
+        model = cls(config, *inputs, **kwargs)
+        if state_dict is None:
+            weights_path = os.path.join(serialization_dir, WEIGHTS_NAME)
+            state_dict = torch.load(weights_path)
+        old_keys = []
+        new_keys = []
+        for key in state_dict.keys():
+            new_key = None
+            if 'gamma' in key:
+                new_key = key.replace('gamma', 'weight')
+            if 'beta' in key:
+                new_key = key.replace('beta', 'bias')
+            if new_key:
+                old_keys.append(key)
+                new_keys.append(new_key)
+        for old_key, new_key in zip(old_keys, new_keys):
+            state_dict[new_key] = state_dict.pop(old_key)
+        missing_keys = []
+        unexpected_keys = []
+        error_msgs = []
+        # copy state_dict so _load_from_state_dict can modify it
+        metadata = getattr(state_dict, '_metadata', None)
+        state_dict = state_dict.copy()
+        if metadata is not None:
+            state_dict._metadata = metadata
+        def load(module, prefix=''):
+            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+            module._load_from_state_dict(
+                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+            for name, child in module._modules.items():
+                if child is not None:
+                    load(child, prefix + name + '.')
+        load(model.transformer if hasattr(model, 'transformer') else model, prefix='')
+        if len(missing_keys) > 0:
+            logger.info("Weights of {} not initialized from pretrained model: {}".format(
+                model.__class__.__name__, missing_keys))
+        if len(unexpected_keys) > 0:
+            logger.info("Weights from pretrained model not used in {}: {}".format(
+                model.__class__.__name__, unexpected_keys))
+        if len(error_msgs) > 0:
+            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+                model.__class__.__name__, "\n\t".join(error_msgs)))
+        # Add additional embeddings for special tokens if needed
+        if num_special_tokens != config.n_special:
+            model.set_num_special_tokens(num_special_tokens)
+        if tempdir:
+            # Clean up temp dir
+            shutil.rmtree(tempdir)
+        return model
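Usage follows the BERT classes; a sketch (the `openai-gpt` shortcut comes from PRETRAINED_MODEL_ARCHIVE_MAP, and num_special_tokens=1 is an illustrative choice for one [CLS]-style token):

    from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel

    # Downloads/caches the archive and loads the weights; because
    # num_special_tokens differs from config.n_special, from_pretrained then
    # calls set_num_special_tokens to grow the embedding matrix by one row.
    model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt', num_special_tokens=1)
    model.eval()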
 class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
@@ -440,6 +442,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         # nn.init.normal_(self.embed.weight, std=0.02)
     def set_num_special_tokens(self, num_special_tokens):
+        " Update input embeddings with new embedding matrice "
         # Update config
         self.config.n_special = num_special_tokens
         # # Build new embeddings and initialize
@@ -451,45 +454,83 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
         self.embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
         self.embed.weight.data[-self.config.n_ctx:, :] = old_embed.weight.data[-self.config.n_ctx:, :]
-    def forward(self, x):
-        x = x.view(-1, x.size(-2), x.size(-1))
-        e = self.embed(x)
+    def forward(self, input_ids, position_ids=None, token_type_ids=None):
+        if position_ids is None:
+            start = self.config.vocab_size + self.config.n_special
+            end = start + input_ids.size(-1)
+            position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device)
+            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_ids.size(-1))
+        position_ids = position_ids.view(-1, position_ids.size(-1))
+        inputs_embeds = self.embed(input_ids)
+        position_embeds = self.embed(position_ids)
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
+            token_type_embeds = self.embed(token_type_ids)
+        else:
+            token_type_embeds = 0
         # Add the position information to the input embeddings
-        h = e.sum(dim=2)
+        # h = e.sum(dim=2)
+        hidden_states = inputs_embeds + position_embeds + token_type_embeds
         for block in self.h:
-            h = block(h)
-        return h
+            hidden_states = block(hidden_states)
+        return hidden_states.view(*input_shape, hidden_states.size(-1))
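The new forward relies on a single shared embedding matrix laid out as regular tokens, then special tokens, then positions, which is why default position ids start at vocab_size + n_special. A sketch of that layout (sizes are the published GPT values, used purely for illustration):

    import torch

    vocab_size, n_special, n_ctx = 40478, 1, 512
    # embedding rows: [0, 40478) tokens | [40478, 40479) special | [40479, 40991) positions

    input_ids = torch.randint(0, vocab_size, (2, 3, 7))        # B x C x S
    start = vocab_size + n_special
    position_ids = torch.arange(start, start + input_ids.size(-1), dtype=torch.long)
    position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
    # position_ids[0, 0] -> tensor([40479, 40480, ..., 40485])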
+class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
+    """ OpenAI GPT model with language model and classification heads """
+    def __init__(self, cfg):
+        super(OpenAIGPTLMHeadModel, self).__init__(cfg)
+        self.transformer = OpenAIGPTModel(cfg)
+        self.lm_head = OpenAIGPTLMHead(self.transformer.embed.weight, cfg)
+        self.apply(self.init_weights)
+    def set_num_special_tokens(self, num_special_tokens):
+        " Update input and output embeddings with new embedding matrice "
+        self.transformer.set_num_special_tokens(num_special_tokens)
+        self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None):
+        hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
+        lm_logits = self.lm_head(hidden_states)
+        if lm_labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(lm_logits, lm_labels)
+            return loss
+        return lm_logits
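A quick smoke test of the new class with random weights; the config sizes mirror the unit test added below, and the classes above are assumed to be in scope:

    import torch

    config = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_special=1, n_ctx=33,
                             n_embd=32, n_layer=2, n_head=4)
    model = OpenAIGPTLMHeadModel(config)
    input_ids = torch.randint(0, 99, (2, 3, 7))     # B x C x S
    lm_logits = model(input_ids)
    # the decoder is tied to the full embedding table, so the last dim is
    # vocab_size + n_special + n_ctx = 133
    print(lm_logits.shape)                           # torch.Size([2, 3, 7, 133])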
 class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     """ OpenAI GPT model with language model and classification heads """
-    def __init__(self, cfg, clf_token='[CLS]'):
+    def __init__(self, cfg):
         super(OpenAIGPTDoubleHeadsModel, self).__init__(cfg)
         self.transformer = OpenAIGPTModel(cfg)
         self.lm_head = OpenAIGPTLMHead(self.transformer.embed.weight, cfg)
-        self.clf_head = OpenAIGPTClfHead(clf_token, cfg)
+        self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(cfg)
         self.apply(self.init_weights)
-    def post_loading(self):
-        " Set the number of special tokens to 1 (for the [CLS] token) "
-        self.set_num_special_tokens(1)
     def set_num_special_tokens(self, num_special_tokens):
         " Update input and output embeddings with new embedding matrice "
         self.transformer.set_num_special_tokens(num_special_tokens)
         self.lm_head.set_embeddings_weights(self.transformer.embed.weight)
-    def forward(self, x, lm_labels=None, clf_labels=None):
-        h = self.transformer(x)
-        lm_logits = self.lm_head(h)
-        clf_logits = self.clf_head(h, x)
+    def forward(self, input_ids, classification_token_mask, position_ids=None, token_type_ids=None,
+                lm_labels=None, multiple_choice_labels=None):
+        """
+        input_ids as to be of shape B x C x S
+        lm_labels can be masked using the -1 value
+        """
+        hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
+        lm_logits = self.lm_head(hidden_states)
+        multiple_choice_logits = self.multiple_choice_head(hidden_states, classification_token_mask)
         losses = []
         if lm_labels is not None:
-            loss_fct = CrossEntropyLoss()
-            losses.append(loss_fct(lm_logits, lm_labels))
-        if clf_labels is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
+        if multiple_choice_labels is not None:
             loss_fct = CrossEntropyLoss()
-            losses.append(loss_fct(clf_logits, clf_labels))
+            losses.append(loss_fct(multiple_choice_logits, multiple_choice_labels.view(-1)))
         if losses:
             return losses
-        return lm_logits, clf_logits
+        return lm_logits, multiple_choice_logits
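Putting it together, a sketch of the double-heads calling convention, reusing the toy `config` from the sketch above (the mask marks the classification token of each choice; -1 entries in lm_labels are ignored by the LM loss):

    import torch

    B, C, S = 2, 3, 7
    model = OpenAIGPTDoubleHeadsModel(config)
    input_ids = torch.randint(0, 99, (B, C, S))
    classification_token_mask = torch.zeros(B, C, S)
    classification_token_mask[:, :, -1] = 1.0        # e.g. last position holds the special token
    lm_labels = input_ids.clone()
    lm_labels[:, :, :2] = -1                         # prompt/pad positions excluded from the loss
    multiple_choice_labels = torch.randint(0, C, (B,))

    lm_loss, mc_loss = model(input_ids, classification_token_mask,
                             lm_labels=lm_labels, multiple_choice_labels=multiple_choice_labels)
    lm_logits, mc_logits = model(input_ids, classification_token_mask)   # B x C x S x 133 and B x C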
@@ -67,19 +67,17 @@ class OpenAIGPTTokenizer(object):
     mostly a wrapper for a public python bpe tokenizer
     """
     @classmethod
-    def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
+    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
         """
         Instantiate a PreTrainedBertModel from a pre-trained model file.
         Download and cache the pre-trained model file if needed.
         """
-        if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
-            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name]
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
+            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
+            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
         else:
-            vocab_file = pretrained_model_name
-            if os.path.isdir(vocab_file):
-                vocab_file = os.path.join(vocab_file, VOCAB_NAME)
-                merges_file = os.path.join(vocab_file, MERGES_NAME)
+            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
+            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
         # redirect to the cache, if necessary
         try:
             resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
@@ -87,11 +85,12 @@ class OpenAIGPTTokenizer(object):
         except FileNotFoundError:
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
-                "We assumed '{}' was a path or url but couldn't find any file "
-                "associated to this path or url.".format(
-                    pretrained_model_name,
+                "We assumed '{}' was a path or url but couldn't find files {} and {} "
+                "at this path or url.".format(
+                    pretrained_model_name_or_path,
                     ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                    vocab_file))
+                    pretrained_model_name_or_path,
+                    vocab_file, merges_file))
             return None
         if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
             logger.info("loading vocabulary file {}".format(vocab_file))
@@ -101,29 +100,38 @@ class OpenAIGPTTokenizer(object):
                 vocab_file, resolved_vocab_file))
             logger.info("loading merges file {} from cache at {}".format(
                 merges_file, resolved_merges_file))
-        if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
+        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
             # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
             # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name]
+            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
             kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
         # Instantiate tokenizer.
         tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs)
         return tokenizer
-    def __init__(self, vocab_file, merges_file):
+    def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
         try:
             import ftfy
             import spacy
         except ImportError:
             raise ImportError("Please install ftfy and spacy to use OpenAI GPT tokenizer.")
+        self.max_len = max_len if max_len is not None else int(1e12)
         self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
+        self.fix_text = ftfy.fix_text
         self.encoder = json.load(open(vocab_file))
         self.decoder = {v:k for k,v in self.encoder.items()}
         merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
         merges = [tuple(merge.split()) for merge in merges]
         self.bpe_ranks = dict(zip(merges, range(len(merges))))
         self.cache = {}
+        if not special_tokens:
+            self.special_tokens = {}
+        else:
+            self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
+    def set_special_tokens(self, special_tokens):
+        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
     def bpe(self, token):
         word = tuple(token[:-1]) + ( token[-1] + '</w>',)
@@ -168,20 +176,38 @@ class OpenAIGPTTokenizer(object):
         self.cache[token] = word
         return word
-    def tokenize(self, texts, verbose=True):
-        texts_tokens = []
-        if verbose:
-            for text in tqdm(texts, ncols=80, leave=False):
-                text = self.nlp(text_standardize(ftfy.fix_text(text)))
-                text_tokens = []
-                for token in text:
-                    text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
-                texts_tokens.append(text_tokens)
-        else:
-            for text in texts:
-                text = self.nlp(text_standardize(ftfy.fix_text(text)))
-                text_tokens = []
-                for token in text:
-                    text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
-                texts_tokens.append(text_tokens)
-        return texts_tokens
+    def tokenize(self, text):
+        split_tokens = []
+        text = self.nlp(text_standardize(self.fix_text(text)))
+        for token in text:
+            split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
+        return split_tokens
+    def convert_tokens_to_ids(self, tokens):
+        """Converts a sequence of tokens into ids using the vocab."""
+        ids = []
+        for token in tokens:
+            if token in self.special_tokens:
+                ids.append(self.special_tokens[token])
+            else:
+                ids.append(self.encoder.get(token, 0))
+        if len(ids) > self.max_len:
+            raise ValueError(
+                "Token indices sequence length is longer than the specified maximum "
+                " sequence length for this BERT model ({} > {}). Running this"
+                " sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
+            )
+        return ids
+    def convert_ids_to_tokens(self, ids):
+        """Converts a sequence of ids in BPE tokens using the vocab."""
+        tokens = []
+        for i in ids:
+            tokens.append(self.decoder[i])
+        return tokens
+    def decode(self, ids):
+        """Converts a sequence of ids in a string."""
+        tokens = self.convert_ids_to_tokens(ids)
+        out_string = ''.join(tokens).replace('</w>', ' ')
+        return out_string
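A hypothetical round trip with the reworked tokenizer API (file paths and the special token are invented; tokenize also needs the spacy 'en' model installed, and decode only knows vocabulary ids since special tokens have no entry in self.decoder):

    tokenizer = OpenAIGPTTokenizer(vocab_file='vocab.json', merges_file='merges.txt',
                                   special_tokens=['<cls>'])
    tokens = tokenizer.tokenize("Hello world!")             # lower-cased BPE string tokens
    ids = tokenizer.convert_tokens_to_ids(tokens + ['<cls>'])
    text = tokenizer.decode(ids[:-1])                       # drop the special id before decoding

The new unit-test file added by the commit follows.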
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import json
import random
import torch
from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTDoubleHeadsModel)
class OpenAIGPTModelTest(unittest.TestCase):
class OpenAIGPTModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_position_ids=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
n_special=1,
n_ctx=33,
n_embd=32,
n_layer=5,
n_head=4,
n_choices=3,
afn="gelu",
resid_pdrop=0.1,
attn_pdrop=0.1,
embd_pdrop=0.1,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
scope=None):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_position_ids = use_position_ids
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.n_special = n_special
self.n_ctx = n_ctx
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.afn = afn
self.n_choices = n_choices
self.resid_pdrop = resid_pdrop
self.attn_pdrop = attn_pdrop
self.embd_pdrop = embd_pdrop
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size)
position_ids = None
if self.use_position_ids:
position_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_ctx)
position_ids = position_ids + self.n_special + self.vocab_size
token_type_ids = None
if self.use_token_type_ids:
total_voc = self.n_ctx + self.n_special + self.vocab_size
token_type_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
multiple_choice_labels = None
lm_labels = None
classification_token_mask = None
if self.use_labels:
multiple_choice_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
lm_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
classification_token_mask = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], 2).float()
config = OpenAIGPTConfig(
vocab_size_or_config_json_file=self.vocab_size,
n_ctx=self.n_ctx,
n_special=self.n_special,
n_embd=self.n_embd,
n_layer=self.n_layer,
n_head=self.n_head,
afn=self.afn,
resid_pdrop=self.resid_pdrop,
attn_pdrop=self.attn_pdrop,
embd_pdrop=self.embd_pdrop,
initializer_range=self.initializer_range)
return (config, input_ids, token_type_ids, position_ids,
multiple_choice_labels, lm_labels, classification_token_mask)
def create_openai_model(self, config, input_ids, token_type_ids, position_ids,
multiple_choice_labels, lm_labels, classification_token_mask):
model = OpenAIGPTModel(config)
hidden_states = model(input_ids, position_ids, token_type_ids)
outputs = {
"hidden_states": hidden_states,
}
return outputs
def check_openai_model_output(self, result):
self.parent.assertListEqual(
list(result["hidden_states"].size()),
[self.batch_size, self.n_choices, self.seq_length, self.n_embd])
def create_openai_double_heads(self, config, input_ids, token_type_ids, position_ids,
multiple_choice_labels, lm_labels, classification_token_mask):
model = OpenAIGPTDoubleHeadsModel(config)
loss = model(input_ids, classification_token_mask, position_ids,
token_type_ids, lm_labels, multiple_choice_labels)
lm_logits, multiple_choice_logits = model(input_ids, classification_token_mask, position_ids, token_type_ids)
outputs = {
"loss": loss,
"lm_logits": lm_logits,
"multiple_choice_logits": multiple_choice_logits,
}
return outputs
def check_openai_double_heads_output(self, result):
total_voc = self.n_ctx + self.n_special + self.vocab_size
self.parent.assertListEqual(
list(result["lm_logits"].size()),
[self.batch_size, self.n_choices, self.seq_length, total_voc])
self.parent.assertListEqual(
list(result["multiple_choice_logits"].size()),
[self.batch_size, self.n_choices])
def check_openai_double_heads_loss_output(self, result):
self.parent.assertListEqual(
[list(l.size()) for l in result["loss"]],
[[], []])
def test_default(self):
self.run_tester(OpenAIGPTModelTest.OpenAIGPTModelTester(self))
def test_config_to_json_string(self):
config = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37)
obj = json.loads(config.to_json_string())
self.assertEqual(obj["vocab_size"], 99)
self.assertEqual(obj["n_embd"], 37)
def run_tester(self, tester):
config_and_inputs = tester.prepare_config_and_inputs()
output_result = tester.create_openai_model(*config_and_inputs)
tester.check_openai_model_output(output_result)
output_result = tester.create_openai_double_heads(*config_and_inputs)
tester.check_openai_double_heads_output(output_result)
tester.check_openai_double_heads_loss_output(output_result)
@classmethod
def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
"""Creates a random int32 tensor of the shape within the vocab size."""
if rng is None:
rng = random.Random()
total_dims = 1
for dim in shape:
total_dims *= dim
values = []
for _ in range(total_dims):
values.append(rng.randint(0, vocab_size - 1))
return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
if __name__ == "__main__":
unittest.main()