Commit 3cf12b23 authored by thomwolf's avatar thomwolf
Browse files

added tests + fixed losses

parent eed51c5b
......@@ -549,7 +549,7 @@ class BertPreTrainedModel(nn.Module):
model.__class__.__name__, unexpected_keys))
if len(error_msgs) > 0:
raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
self.__class__.__name__, "\n\t".join(error_msgs)))
model.__class__.__name__, "\n\t".join(error_msgs)))
if tempdir:
# Clean up temp dir
shutil.rmtree(tempdir)
......
This diff is collapsed.
......@@ -67,19 +67,17 @@ class OpenAIGPTTokenizer(object):
mostly a wrapper for a public python bpe tokenizer
"""
@classmethod
def from_pretrained(cls, pretrained_model_name, cache_dir=None, *inputs, **kwargs):
def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
"""
Instantiate a PreTrainedBertModel from a pre-trained model file.
Download and cache the pre-trained model file if needed.
"""
if pretrained_model_name in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name]
merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name]
if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
else:
vocab_file = pretrained_model_name
if os.path.isdir(vocab_file):
vocab_file = os.path.join(vocab_file, VOCAB_NAME)
merges_file = os.path.join(vocab_file, MERGES_NAME)
vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
# redirect to the cache, if necessary
try:
resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
......@@ -87,11 +85,12 @@ class OpenAIGPTTokenizer(object):
except FileNotFoundError:
logger.error(
"Model name '{}' was not found in model name list ({}). "
"We assumed '{}' was a path or url but couldn't find any file "
"associated to this path or url.".format(
pretrained_model_name,
"We assumed '{}' was a path or url but couldn't find files {} and {} "
"at this path or url.".format(
pretrained_model_name_or_path,
', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
vocab_file))
pretrained_model_name_or_path,
vocab_file, merges_file))
return None
if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
logger.info("loading vocabulary file {}".format(vocab_file))
......@@ -101,29 +100,38 @@ class OpenAIGPTTokenizer(object):
vocab_file, resolved_vocab_file))
logger.info("loading merges file {} from cache at {}".format(
merges_file, resolved_merges_file))
if pretrained_model_name in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
# if we're using a pretrained model, ensure the tokenizer wont index sequences longer
# than the number of positional embeddings
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name]
max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
# Instantiate tokenizer.
tokenizer = cls(resolved_vocab_file, resolved_merges_file, *inputs, **kwargs)
return tokenizer
def __init__(self, vocab_file, merges_file):
def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
try:
import ftfy
import spacy
except ImportError:
raise ImportError("Please install ftfy and spacy to use OpenAI GPT tokenizer.")
self.max_len = max_len if max_len is not None else int(1e12)
self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
self.fix_text = ftfy.fix_text
self.encoder = json.load(open(vocab_file))
self.decoder = {v:k for k,v in self.encoder.items()}
merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
merges = [tuple(merge.split()) for merge in merges]
self.bpe_ranks = dict(zip(merges, range(len(merges))))
self.cache = {}
if not special_tokens:
self.special_tokens = {}
else:
self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
def set_special_tokens(self, special_tokens):
self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
def bpe(self, token):
word = tuple(token[:-1]) + ( token[-1] + '</w>',)
......@@ -168,20 +176,38 @@ class OpenAIGPTTokenizer(object):
self.cache[token] = word
return word
def tokenize(self, texts, verbose=True):
texts_tokens = []
if verbose:
for text in tqdm(texts, ncols=80, leave=False):
text = self.nlp(text_standardize(ftfy.fix_text(text)))
text_tokens = []
def tokenize(self, text):
split_tokens = []
text = self.nlp(text_standardize(self.fix_text(text)))
for token in text:
text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
texts_tokens.append(text_tokens)
split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
return split_tokens
def convert_tokens_to_ids(self, tokens):
"""Converts a sequence of tokens into ids using the vocab."""
ids = []
for token in tokens:
if token in self.special_tokens:
ids.append(self.special_tokens[token])
else:
for text in texts:
text = self.nlp(text_standardize(ftfy.fix_text(text)))
text_tokens = []
for token in text:
text_tokens.extend([self.encoder.get(t, 0) for t in self.bpe(token.text.lower()).split(' ')])
texts_tokens.append(text_tokens)
return texts_tokens
ids.append(self.encoder.get(token, 0))
if len(ids) > self.max_len:
raise ValueError(
"Token indices sequence length is longer than the specified maximum "
" sequence length for this BERT model ({} > {}). Running this"
" sequence through BERT will result in indexing errors".format(len(ids), self.max_len)
)
return ids
def convert_ids_to_tokens(self, ids):
"""Converts a sequence of ids in BPE tokens using the vocab."""
tokens = []
for i in ids:
tokens.append(self.decoder[i])
return tokens
def decode(self, ids):
"""Converts a sequence of ids in a string."""
tokens = self.convert_ids_to_tokens(ids)
out_string = ''.join(tokens).replace('</w>', ' ')
return out_string
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import unittest
import json
import random
import torch
from pytorch_pretrained_bert import (OpenAIGPTConfig, OpenAIGPTModel, OpenAIGPTDoubleHeadsModel)
class OpenAIGPTModelTest(unittest.TestCase):
class OpenAIGPTModelTester(object):
def __init__(self,
parent,
batch_size=13,
seq_length=7,
is_training=True,
use_position_ids=True,
use_token_type_ids=True,
use_labels=True,
vocab_size=99,
n_special=1,
n_ctx=33,
n_embd=32,
n_layer=5,
n_head=4,
n_choices=3,
afn="gelu",
resid_pdrop=0.1,
attn_pdrop=0.1,
embd_pdrop=0.1,
type_sequence_label_size=2,
initializer_range=0.02,
num_labels=3,
scope=None):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.is_training = is_training
self.use_position_ids = use_position_ids
self.use_token_type_ids = use_token_type_ids
self.use_labels = use_labels
self.vocab_size = vocab_size
self.n_special = n_special
self.n_ctx = n_ctx
self.n_embd = n_embd
self.n_layer = n_layer
self.n_head = n_head
self.afn = afn
self.n_choices = n_choices
self.resid_pdrop = resid_pdrop
self.attn_pdrop = attn_pdrop
self.embd_pdrop = embd_pdrop
self.type_sequence_label_size = type_sequence_label_size
self.initializer_range = initializer_range
self.num_labels = num_labels
self.scope = scope
def prepare_config_and_inputs(self):
input_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.vocab_size)
position_ids = None
if self.use_position_ids:
position_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.n_ctx)
position_ids = position_ids + self.n_special + self.vocab_size
token_type_ids = None
if self.use_token_type_ids:
total_voc = self.n_ctx + self.n_special + self.vocab_size
token_type_ids = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], total_voc)
multiple_choice_labels = None
lm_labels = None
classification_token_mask = None
if self.use_labels:
multiple_choice_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size], self.type_sequence_label_size)
lm_labels = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], self.num_labels)
classification_token_mask = OpenAIGPTModelTest.ids_tensor([self.batch_size, self.n_choices, self.seq_length], 2).float()
config = OpenAIGPTConfig(
vocab_size_or_config_json_file=self.vocab_size,
n_ctx=self.n_ctx,
n_special=self.n_special,
n_embd=self.n_embd,
n_layer=self.n_layer,
n_head=self.n_head,
afn=self.afn,
resid_pdrop=self.resid_pdrop,
attn_pdrop=self.attn_pdrop,
embd_pdrop=self.embd_pdrop,
initializer_range=self.initializer_range)
return (config, input_ids, token_type_ids, position_ids,
multiple_choice_labels, lm_labels, classification_token_mask)
def create_openai_model(self, config, input_ids, token_type_ids, position_ids,
multiple_choice_labels, lm_labels, classification_token_mask):
model = OpenAIGPTModel(config)
hidden_states = model(input_ids, position_ids, token_type_ids)
outputs = {
"hidden_states": hidden_states,
}
return outputs
def check_openai_model_output(self, result):
self.parent.assertListEqual(
list(result["hidden_states"].size()),
[self.batch_size, self.n_choices, self.seq_length, self.n_embd])
def create_openai_double_heads(self, config, input_ids, token_type_ids, position_ids,
multiple_choice_labels, lm_labels, classification_token_mask):
model = OpenAIGPTDoubleHeadsModel(config)
loss = model(input_ids, classification_token_mask, position_ids,
token_type_ids, lm_labels, multiple_choice_labels)
lm_logits, multiple_choice_logits = model(input_ids, classification_token_mask, position_ids, token_type_ids)
outputs = {
"loss": loss,
"lm_logits": lm_logits,
"multiple_choice_logits": multiple_choice_logits,
}
return outputs
def check_openai_double_heads_output(self, result):
total_voc = self.n_ctx + self.n_special + self.vocab_size
self.parent.assertListEqual(
list(result["lm_logits"].size()),
[self.batch_size, self.n_choices, self.seq_length, total_voc])
self.parent.assertListEqual(
list(result["multiple_choice_logits"].size()),
[self.batch_size, self.n_choices])
def check_openai_double_heads_loss_output(self, result):
self.parent.assertListEqual(
[list(l.size()) for l in result["loss"]],
[[], []])
def test_default(self):
self.run_tester(OpenAIGPTModelTest.OpenAIGPTModelTester(self))
def test_config_to_json_string(self):
config = OpenAIGPTConfig(vocab_size_or_config_json_file=99, n_embd=37)
obj = json.loads(config.to_json_string())
self.assertEqual(obj["vocab_size"], 99)
self.assertEqual(obj["n_embd"], 37)
def run_tester(self, tester):
config_and_inputs = tester.prepare_config_and_inputs()
output_result = tester.create_openai_model(*config_and_inputs)
tester.check_openai_model_output(output_result)
output_result = tester.create_openai_double_heads(*config_and_inputs)
tester.check_openai_double_heads_output(output_result)
tester.check_openai_double_heads_loss_output(output_result)
@classmethod
def ids_tensor(cls, shape, vocab_size, rng=None, name=None):
"""Creates a random int32 tensor of the shape within the vocab size."""
if rng is None:
rng = random.Random()
total_dims = 1
for dim in shape:
total_dims *= dim
values = []
for _ in range(total_dims):
values.append(rng.randint(0, vocab_size - 1))
return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
if __name__ == "__main__":
unittest.main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment