"...lm-evaluation-harness.git" did not exist on "d1e7a30a13a709148f78df214b33a41c89a2f9db"
Commit 982f181a authored by erenup's avatar erenup
Browse files

Merge remote-tracking branch 'origin/master' into run_multiple_choice_add_doc

parents 603b470a 84b9d1c4
@@ -26,7 +26,8 @@ from pytorch_transformers import (BertConfig, BertModel, BertForMaskedLM,
                                   BertForTokenClassification, BertForMultipleChoice)
 from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
+from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .configuration_common_test import ConfigTester

 class BertModelTest(CommonTestCases.CommonModelTester):
@@ -126,8 +127,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertModel(config=config)
             model.eval()
-            sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
-            sequence_output, pooled_output = model(input_ids, token_type_ids)
+            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+            sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids)

             result = {
@@ -143,7 +144,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForMaskedLM(config=config)
             model.eval()
-            loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels)
+            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {
                 "loss": loss,
                 "prediction_scores": prediction_scores,
@@ -156,7 +157,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForNextSentencePrediction(config=config)
             model.eval()
-            loss, seq_relationship_score = model(input_ids, token_type_ids, input_mask, sequence_labels)
+            loss, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, next_sentence_label=sequence_labels)
             result = {
                 "loss": loss,
                 "seq_relationship_score": seq_relationship_score,
@@ -170,7 +171,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForPreTraining(config=config)
             model.eval()
-            loss, prediction_scores, seq_relationship_score = model(input_ids, token_type_ids, input_mask, token_labels, sequence_labels)
+            loss, prediction_scores, seq_relationship_score = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
+                                                                    masked_lm_labels=token_labels, next_sentence_label=sequence_labels)
             result = {
                 "loss": loss,
                 "prediction_scores": prediction_scores,
@@ -188,7 +190,8 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForQuestionAnswering(config=config)
             model.eval()
-            loss, start_logits, end_logits = model(input_ids, token_type_ids, input_mask, sequence_labels, sequence_labels)
+            loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
+                                                   start_positions=sequence_labels, end_positions=sequence_labels)
             result = {
                 "loss": loss,
                 "start_logits": start_logits,
@@ -207,7 +210,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
             config.num_labels = self.num_labels
             model = BertForSequenceClassification(config)
             model.eval()
-            loss, logits = model(input_ids, token_type_ids, input_mask, sequence_labels)
+            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
             result = {
                 "loss": loss,
                 "logits": logits,
@@ -222,7 +225,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
             config.num_labels = self.num_labels
             model = BertForTokenClassification(config=config)
             model.eval()
-            loss, logits = model(input_ids, token_type_ids, input_mask, token_labels)
+            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
             result = {
                 "loss": loss,
                 "logits": logits,
@@ -241,9 +244,9 @@ class BertModelTest(CommonTestCases.CommonModelTester):
             multiple_choice_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             multiple_choice_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             loss, logits = model(multiple_choice_inputs_ids,
-                                 multiple_choice_token_type_ids,
-                                 multiple_choice_input_mask,
-                                 choice_labels)
+                                 attention_mask=multiple_choice_input_mask,
+                                 token_type_ids=multiple_choice_token_type_ids,
+                                 labels=choice_labels)
             result = {
                 "loss": loss,
                 "logits": logits,
...
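The pattern in this file carries through the rest of the test diffs: model calls switch from positional to keyword arguments. This matters because the forward signatures in pytorch-transformers take `attention_mask` before `token_type_ids`, the reverse of the original `(input_ids, token_type_ids, attention_mask)` ordering, so the old positional calls fed each tensor into the wrong slot. A minimal sketch of the failure mode, using a hypothetical stand-in `forward` rather than a real model:

    import torch

    batch_size, seq_length = 2, 5
    input_ids = torch.randint(0, 99, (batch_size, seq_length))
    token_type_ids = torch.zeros(batch_size, seq_length, dtype=torch.long)
    input_mask = torch.ones(batch_size, seq_length, dtype=torch.long)

    def forward(input_ids, attention_mask=None, token_type_ids=None):
        # Stand-in for a model forward; returns what landed in each slot.
        return attention_mask, token_type_ids

    # Positional call in the old argument order: token_type_ids lands in
    # the attention_mask slot, and the mask in the token_type_ids slot.
    mask_slot, types_slot = forward(input_ids, token_type_ids, input_mask)
    assert mask_slot is token_type_ids      # wrong tensor in the mask slot

    # Keyword call: each tensor reaches the intended parameter.
    mask_slot, types_slot = forward(input_ids, attention_mask=input_mask,
                                    token_type_ids=token_type_ids)
    assert mask_slot is input_mask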
@@ -28,9 +28,9 @@ import logging
 import torch

-from pytorch_transformers import PretrainedConfig, PreTrainedModel
-from pytorch_transformers.modeling_bert import BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
-from pytorch_transformers.modeling_gpt2 import GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers import (PretrainedConfig, PreTrainedModel,
+                                  BertModel, BertConfig, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                  GPT2LMHeadModel, GPT2Config, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)

 def _config_zero_init(config):
@@ -163,7 +163,9 @@ class CommonTestCases:
             if not self.test_head_masking:
                 return

+            global_rng.seed(42)
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            global_rng.seed()

             config.output_attentions = True
             config.output_hidden_states = True
@@ -173,7 +175,7 @@ class CommonTestCases:
                 model.eval()

                 # Prepare head_mask
                 # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
                 head_mask = torch.ones(self.model_tester.num_hidden_layers, self.model_tester.num_attention_heads)
                 head_mask[0, 0] = 0
                 head_mask[-1, :-1] = 0
@@ -212,9 +214,12 @@ class CommonTestCases:
             if not self.test_pruning:
                 return

-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
             for model_class in self.all_model_classes:
+                config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+                if "head_mask" in inputs_dict:
+                    del inputs_dict["head_mask"]
+
                 config.output_attentions = True
                 config.output_hidden_states = False
                 model = model_class(config=config)
@@ -233,6 +238,120 @@ class CommonTestCases:
                 self.assertEqual(
                     attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)

+        def test_head_pruning_save_load_from_pretrained(self):
+            if not self.test_pruning:
+                return
+
+            for model_class in self.all_model_classes:
+                config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+                if "head_mask" in inputs_dict:
+                    del inputs_dict["head_mask"]
+
+                config.output_attentions = True
+                config.output_hidden_states = False
+                model = model_class(config=config)
+                model.eval()
+                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
+                                  -1: [0]}
+                model.prune_heads(heads_to_prune)
+                directory = "pruned_model"
+                if not os.path.exists(directory):
+                    os.makedirs(directory)
+                model.save_pretrained(directory)
+                model = model_class.from_pretrained(directory)
+
+                outputs = model(**inputs_dict)
+                attentions = outputs[-1]
+                self.assertEqual(attentions[0].shape[-3], 1)
+                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
+                self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
+                shutil.rmtree(directory)
+
+        def test_head_pruning_save_load_from_config_init(self):
+            if not self.test_pruning:
+                return
+
+            for model_class in self.all_model_classes:
+                config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+                if "head_mask" in inputs_dict:
+                    del inputs_dict["head_mask"]
+
+                config.output_attentions = True
+                config.output_hidden_states = False
+
+                heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
+                                  -1: [0]}
+                config.pruned_heads = heads_to_prune
+
+                model = model_class(config=config)
+                model.eval()
+
+                outputs = model(**inputs_dict)
+                attentions = outputs[-1]
+
+                self.assertEqual(attentions[0].shape[-3], 1)
+                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
+                self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1)
+
+        def test_head_pruning_integration(self):
+            if not self.test_pruning:
+                return
+
+            for model_class in self.all_model_classes:
+                config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+                if "head_mask" in inputs_dict:
+                    del inputs_dict["head_mask"]
+
+                config.output_attentions = True
+                config.output_hidden_states = False
+
+                heads_to_prune = {0: [0], 1: [1, 2]}
+                config.pruned_heads = heads_to_prune
+
+                model = model_class(config=config)
+                model.eval()
+
+                outputs = model(**inputs_dict)
+                attentions = outputs[-1]
+
+                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
+                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
+                self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
+                self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
+
+                directory = "pruned_model"
+                if not os.path.exists(directory):
+                    os.makedirs(directory)
+                model.save_pretrained(directory)
+                model = model_class.from_pretrained(directory)
+                shutil.rmtree(directory)
+
+                outputs = model(**inputs_dict)
+                attentions = outputs[-1]
+
+                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
+                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
+                self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads)
+                self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
+
+                heads_to_prune = {0: [0], 2: [1, 2]}
+                model.prune_heads(heads_to_prune)
+
+                outputs = model(**inputs_dict)
+                attentions = outputs[-1]
+
+                self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
+                self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads - 2)
+                self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads - 2)
+                self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads)
+
+                self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
+
         def test_hidden_states_output(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -547,12 +666,13 @@ class ConfigTester(object):
         self.create_and_test_config_to_json_file()

+global_rng = random.Random()
+
 def ids_tensor(shape, vocab_size, rng=None, name=None):
     """Creates a random int32 tensor of the shape within the vocab size."""
     if rng is None:
-        rng = random.Random()
+        rng = global_rng

     total_dims = 1
     for dim in shape:
...
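Two things are going on above: the module-level `global_rng` (seeded around `prepare_config_and_inputs_for_common`) makes `ids_tensor` inputs reproducible for the head-masking test, and the new pruning tests exercise two routes to the same end state: calling `prune_heads` on a built model, and declaring `config.pruned_heads` so the model is constructed (or reloaded) already pruned, with the integration test checking that the two compose and that the merged mapping survives `save_pretrained`/`from_pretrained`. A condensed sketch of both routes with `BertModel`; the sizes are arbitrary and mirror the testers above, and it assumes a pytorch_transformers build from around this commit:

    import torch
    from pytorch_transformers import BertConfig, BertModel

    config = BertConfig(vocab_size_or_config_json_file=99, hidden_size=32,
                        num_hidden_layers=2, num_attention_heads=4,
                        intermediate_size=37)
    config.output_attentions = True

    # Route 1: prune an already-constructed model.
    model = BertModel(config=config)
    model.prune_heads({0: [0, 1]})       # remove heads 0 and 1 of layer 0

    # Route 2: record the pruning on the config so a fresh (or reloaded)
    # model starts out pruned.
    config.pruned_heads = {0: [0, 1]}
    model = BertModel(config=config)
    model.eval()

    input_ids = torch.randint(0, 99, (1, 7))
    outputs = model(input_ids)
    attentions = outputs[-1]             # last output when output_attentions is set
    assert attentions[0].shape[-3] == 2  # 4 heads - 2 pruned in layer 0
    assert attentions[1].shape[-3] == 4  # layer 1 untouched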
@@ -17,14 +17,12 @@ from __future__ import division
 from __future__ import print_function

 import unittest
-import shutil
-import pytest

 from pytorch_transformers import (DistilBertConfig, DistilBertModel, DistilBertForMaskedLM,
                                   DistilBertForQuestionAnswering, DistilBertForSequenceClassification)
-from pytorch_transformers.modeling_distilbert import DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
+from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .configuration_common_test import ConfigTester

 class DistilBertModelTest(CommonTestCases.CommonModelTester):
@@ -148,7 +146,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_distilbert_for_question_answering(self, config, input_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = DistilBertForQuestionAnswering(config=config)
             model.eval()
-            loss, start_logits, end_logits = model(input_ids, input_mask, sequence_labels, sequence_labels)
+            loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, start_positions=sequence_labels, end_positions=sequence_labels)
             result = {
                 "loss": loss,
                 "start_logits": start_logits,
@@ -166,7 +164,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
             config.num_labels = self.num_labels
             model = DistilBertForSequenceClassification(config)
             model.eval()
-            loss, logits = model(input_ids, input_mask, sequence_labels)
+            loss, logits = model(input_ids, attention_mask=input_mask, labels=sequence_labels)
             result = {
                 "loss": loss,
                 "logits": logits,
...
@@ -18,31 +18,197 @@ from __future__ import print_function

 import unittest
 import pytest
+import shutil

-from pytorch_transformers import (GPT2Config, GPT2Model,
+from pytorch_transformers import (GPT2Config, GPT2Model, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
                                   GPT2LMHeadModel, GPT2DoubleHeadsModel)

-from .modeling_common_test import CommonTestCases, ConfigTester
+from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .configuration_common_test import ConfigTester

-class GPT2ModelTest(unittest.TestCase):
+class GPT2ModelTest(CommonTestCases.CommonModelTester):
+
+    all_model_classes = (GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel)
+
+    class GPT2ModelTester(object):
+
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     seq_length=7,
+                     is_training=True,
+                     use_token_type_ids=True,
+                     use_labels=True,
+                     vocab_size=99,
+                     hidden_size=32,
+                     num_hidden_layers=5,
+                     num_attention_heads=4,
+                     intermediate_size=37,
+                     hidden_act="gelu",
+                     hidden_dropout_prob=0.1,
+                     attention_probs_dropout_prob=0.1,
+                     max_position_embeddings=512,
+                     type_vocab_size=16,
+                     type_sequence_label_size=2,
+                     initializer_range=0.02,
+                     num_labels=3,
+                     num_choices=4,
+                     scope=None,
+                     ):
+            self.parent = parent
+            self.batch_size = batch_size
+            self.seq_length = seq_length
+            self.is_training = is_training
+            self.use_token_type_ids = use_token_type_ids
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.intermediate_size = intermediate_size
+            self.hidden_act = hidden_act
+            self.hidden_dropout_prob = hidden_dropout_prob
+            self.attention_probs_dropout_prob = attention_probs_dropout_prob
+            self.max_position_embeddings = max_position_embeddings
+            self.type_vocab_size = type_vocab_size
+            self.type_sequence_label_size = type_sequence_label_size
+            self.initializer_range = initializer_range
+            self.num_labels = num_labels
+            self.num_choices = num_choices
+            self.scope = scope
+
+        def prepare_config_and_inputs(self):
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+            token_type_ids = None
+            if self.use_token_type_ids:
+                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+            sequence_labels = None
+            token_labels = None
+            choice_labels = None
+            if self.use_labels:
+                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+                choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+            config = GPT2Config(
+                vocab_size_or_config_json_file=self.vocab_size,
+                n_embd=self.hidden_size,
+                n_layer=self.num_hidden_layers,
+                n_head=self.num_attention_heads,
+                # intermediate_size=self.intermediate_size,
+                # hidden_act=self.hidden_act,
+                # hidden_dropout_prob=self.hidden_dropout_prob,
+                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+                n_positions=self.max_position_embeddings,
+                n_ctx=self.max_position_embeddings
+                # type_vocab_size=self.type_vocab_size,
+                # initializer_range=self.initializer_range
+            )
+
+            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
+
+            return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels
+
+        def check_loss_output(self, result):
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+
+        def create_and_check_gpt2_model(self, config, input_ids, head_mask, token_type_ids, *args):
+            model = GPT2Model(config=config)
+            model.eval()
+
+            model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
+            model(input_ids, token_type_ids=token_type_ids)
+            sequence_output, presents = model(input_ids)
+
+            result = {
+                "sequence_output": sequence_output,
+                "presents": presents,
+            }
+            self.parent.assertListEqual(
+                list(result["sequence_output"].size()),
+                [self.batch_size, self.seq_length, self.hidden_size])
+            self.parent.assertEqual(len(result["presents"]), config.n_layer)
+
+        def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
+            model = GPT2LMHeadModel(config)
+            model.eval()
+
+            loss, lm_logits, _ = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
+
+            result = {
+                "loss": loss,
+                "lm_logits": lm_logits
+            }
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+            self.parent.assertListEqual(
+                list(result["lm_logits"].size()),
+                [self.batch_size, self.seq_length, self.vocab_size])
+
+        def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
+            model = GPT2DoubleHeadsModel(config)
+            model.eval()
+
+            loss, lm_logits, mc_logits, _ = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
+
+            result = {
+                "loss": loss,
+                "lm_logits": lm_logits
+            }
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+            self.parent.assertListEqual(
+                list(result["lm_logits"].size()),
+                [self.batch_size, self.seq_length, self.vocab_size])
+
+        def prepare_config_and_inputs_for_common(self):
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
+            inputs_dict = {
+                'input_ids': input_ids,
+                'token_type_ids': token_type_ids,
+                'head_mask': head_mask
+            }
+            return config, inputs_dict
+
+    def setUp(self):
+        self.model_tester = GPT2ModelTest.GPT2ModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37)

     def test_config(self):
-        config_tester = ConfigTester(self, config_class=GPT2Config, n_embd=37)
-        config_tester.run_common_tests()
+        self.config_tester.run_common_tests()

-    def test_model(self):
-        model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
-                                                      lm_head_model_class=GPT2LMHeadModel,
-                                                      double_head_model_class=GPT2DoubleHeadsModel)
-        model_tester.run_common_tests(test_presents=True)
+    def test_gpt2_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_gpt2_model(*config_and_inputs)
+
+    def test_gpt2_lm_head_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
+
+    def test_gpt2_double_lm_head_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)

     @pytest.mark.slow
-    def test_pretrained(self):
-        model_tester = CommonTestCases.GPTModelTester(self, config_class=GPT2Config, base_model_class=GPT2Model,
-                                                      lm_head_model_class=GPT2LMHeadModel,
-                                                      double_head_model_class=GPT2DoubleHeadsModel)
-        model_tester.run_slow_tests()
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_transformers_test/"
+        for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)

 if __name__ == "__main__":
     unittest.main()
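The GPT-2 test class above (and the OpenAI GPT one below) now follows the same contract the BERT and DistilBERT tests already use: the outer test class lists `all_model_classes` and owns a nested `*ModelTester`, and the shared `CommonTestCases.CommonModelTester` machinery drives every listed class through the same `(config, inputs_dict)` pair. A skeletal sketch of that contract, with hypothetical names, not the library's actual implementation:

    # Hypothetical skeleton of the CommonModelTester contract; the real
    # shared tests live in modeling_common_test.py and do far more.
    class CommonModelTesterSkeleton(object):
        all_model_classes = ()      # each concrete test class fills this in

        def run_shared_check(self):
            # Every tester must expose prepare_config_and_inputs_for_common().
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            for model_class in self.all_model_classes:
                model = model_class(config=config)
                model.eval()
                model(**inputs_dict)    # each model must accept the shared dict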
@@ -18,31 +18,195 @@ from __future__ import print_function

 import unittest
 import pytest
+import shutil

-from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel,
+from pytorch_transformers import (OpenAIGPTConfig, OpenAIGPTModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
                                   OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)

-from .modeling_common_test import CommonTestCases, ConfigTester
+from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .configuration_common_test import ConfigTester

-class OpenAIModelTest(unittest.TestCase):
+class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
+
+    all_model_classes = (OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel)
+
+    class OpenAIGPTModelTester(object):
+
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     seq_length=7,
+                     is_training=True,
+                     use_token_type_ids=True,
+                     use_labels=True,
+                     vocab_size=99,
+                     hidden_size=32,
+                     num_hidden_layers=5,
+                     num_attention_heads=4,
+                     intermediate_size=37,
+                     hidden_act="gelu",
+                     hidden_dropout_prob=0.1,
+                     attention_probs_dropout_prob=0.1,
+                     max_position_embeddings=512,
+                     type_vocab_size=16,
+                     type_sequence_label_size=2,
+                     initializer_range=0.02,
+                     num_labels=3,
+                     num_choices=4,
+                     scope=None,
+                     ):
+            self.parent = parent
+            self.batch_size = batch_size
+            self.seq_length = seq_length
+            self.is_training = is_training
+            self.use_token_type_ids = use_token_type_ids
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.intermediate_size = intermediate_size
+            self.hidden_act = hidden_act
+            self.hidden_dropout_prob = hidden_dropout_prob
+            self.attention_probs_dropout_prob = attention_probs_dropout_prob
+            self.max_position_embeddings = max_position_embeddings
+            self.type_vocab_size = type_vocab_size
+            self.type_sequence_label_size = type_sequence_label_size
+            self.initializer_range = initializer_range
+            self.num_labels = num_labels
+            self.num_choices = num_choices
+            self.scope = scope
+
+        def prepare_config_and_inputs(self):
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+            token_type_ids = None
+            if self.use_token_type_ids:
+                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+            sequence_labels = None
+            token_labels = None
+            choice_labels = None
+            if self.use_labels:
+                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+                choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+            config = OpenAIGPTConfig(
+                vocab_size_or_config_json_file=self.vocab_size,
+                n_embd=self.hidden_size,
+                n_layer=self.num_hidden_layers,
+                n_head=self.num_attention_heads,
+                # intermediate_size=self.intermediate_size,
+                # hidden_act=self.hidden_act,
+                # hidden_dropout_prob=self.hidden_dropout_prob,
+                # attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+                n_positions=self.max_position_embeddings,
+                n_ctx=self.max_position_embeddings
+                # type_vocab_size=self.type_vocab_size,
+                # initializer_range=self.initializer_range
+            )
+
+            head_mask = ids_tensor([self.num_hidden_layers, self.num_attention_heads], 2)
+
+            return config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels
+
+        def check_loss_output(self, result):
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+
+        def create_and_check_openai_gpt_model(self, config, input_ids, head_mask, token_type_ids, *args):
+            model = OpenAIGPTModel(config=config)
+            model.eval()
+
+            model(input_ids, token_type_ids=token_type_ids, head_mask=head_mask)
+            model(input_ids, token_type_ids=token_type_ids)
+            (sequence_output,) = model(input_ids)
+
+            result = {
+                "sequence_output": sequence_output
+            }
+            self.parent.assertListEqual(
+                list(result["sequence_output"].size()),
+                [self.batch_size, self.seq_length, self.hidden_size])
+
+        def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
+            model = OpenAIGPTLMHeadModel(config)
+            model.eval()
+
+            loss, lm_logits = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
+
+            result = {
+                "loss": loss,
+                "lm_logits": lm_logits
+            }
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+            self.parent.assertListEqual(
+                list(result["lm_logits"].size()),
+                [self.batch_size, self.seq_length, self.vocab_size])
+
+        def create_and_check_double_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
+            model = OpenAIGPTDoubleHeadsModel(config)
+            model.eval()
+
+            loss, lm_logits, mc_logits = model(input_ids, token_type_ids=token_type_ids, lm_labels=input_ids)
+
+            result = {
+                "loss": loss,
+                "lm_logits": lm_logits
+            }
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+            self.parent.assertListEqual(
+                list(result["lm_logits"].size()),
+                [self.batch_size, self.seq_length, self.vocab_size])
+
+        def prepare_config_and_inputs_for_common(self):
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids, head_mask, token_type_ids, sequence_labels, token_labels, choice_labels) = config_and_inputs
+            inputs_dict = {
+                'input_ids': input_ids,
+                'token_type_ids': token_type_ids,
+                'head_mask': head_mask
+            }
+            return config, inputs_dict
+
+    def setUp(self):
+        self.model_tester = OpenAIGPTModelTest.OpenAIGPTModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37)

     def test_config(self):
-        config_tester = ConfigTester(self, config_class=OpenAIGPTConfig, n_embd=37)
-        config_tester.run_common_tests()
+        self.config_tester.run_common_tests()

-    def test_model(self):
-        model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
-                                                      lm_head_model_class=OpenAIGPTLMHeadModel,
-                                                      double_head_model_class=OpenAIGPTDoubleHeadsModel)
-        model_tester.run_common_tests(test_presents=False)
+    def test_openai_gpt_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_openai_gpt_model(*config_and_inputs)
+
+    def test_openai_gpt_lm_head_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_lm_head_model(*config_and_inputs)
+
+    def test_openai_gpt_double_lm_head_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_double_lm_head_model(*config_and_inputs)

     @pytest.mark.slow
-    def test_pretrained(self):
-        model_tester = CommonTestCases.GPTModelTester(self, config_class=OpenAIGPTConfig, base_model_class=OpenAIGPTModel,
-                                                      lm_head_model_class=OpenAIGPTLMHeadModel,
-                                                      double_head_model_class=OpenAIGPTDoubleHeadsModel)
-        model_tester.run_slow_tests()
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_transformers_test/"
+        for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)

 if __name__ == "__main__":
     unittest.main()
@@ -24,7 +24,8 @@ import torch

 from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification)
 from pytorch_transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
+from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .configuration_common_test import ConfigTester

 class RobertaModelTest(CommonTestCases.CommonModelTester):
@@ -123,8 +124,8 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
                                            token_labels, choice_labels):
             model = RobertaModel(config=config)
             model.eval()
-            sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
-            sequence_output, pooled_output = model(input_ids, token_type_ids)
+            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+            sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
             sequence_output, pooled_output = model(input_ids)

             result = {
@@ -140,7 +141,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
                                            token_labels, choice_labels):
             model = RobertaForMaskedLM(config=config)
             model.eval()
-            loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels)
+            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
             result = {
                 "loss": loss,
                 "prediction_scores": prediction_scores,
...
@@ -16,9 +16,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import os
 import unittest
-import json
 import random
 import shutil
 import pytest
@@ -28,7 +26,8 @@ import torch

 from pytorch_transformers import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel)
 from pytorch_transformers.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor
+from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .configuration_common_test import ConfigTester

 class TransfoXLModelTest(CommonTestCases.CommonModelTester):
...
@@ -23,7 +23,8 @@ import pytest

 from pytorch_transformers import (XLMConfig, XLMModel, XLMWithLMHeadModel, XLMForQuestionAnswering, XLMForSequenceClassification)
 from pytorch_transformers.modeling_xlm import XLM_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
+from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .configuration_common_test import ConfigTester

 class XLMModelTest(CommonTestCases.CommonModelTester):
...
@@ -28,7 +28,8 @@ import torch

 from pytorch_transformers import (XLNetConfig, XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering)
 from pytorch_transformers.modeling_xlnet import XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
-from .modeling_common_test import ConfigTester, CommonTestCases, ids_tensor
+from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .configuration_common_test import ConfigTester

 class XLNetModelTest(CommonTestCases.CommonModelTester):
...
@@ -41,8 +41,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

-    def get_tokenizer(self):
-        return self.tokenizer_class.from_pretrained(self.tmpdirname)
+    def get_tokenizer(self, **kwargs):
+        return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)

     def get_input_output_texts(self):
         input_text = u"UNwant\u00E9d,running"
...
@@ -27,8 +27,8 @@ class DistilBertTokenizationTest(BertTokenizationTest):

     tokenizer_class = DistilBertTokenizer

-    def get_tokenizer(self):
-        return DistilBertTokenizer.from_pretrained(self.tmpdirname)
+    def get_tokenizer(self, **kwargs):
+        return DistilBertTokenizer.from_pretrained(self.tmpdirname, **kwargs)

     def test_sequence_builders(self):
         tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
...
@@ -17,6 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals

 import os
 import unittest
 import json
+from io import open

 from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
@@ -31,36 +32,38 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
-                 "lo", "low", "er",
-                 "low", "lowest", "newer", "wider", "<unk>"]
+                 "\u0120", "\u0120l", "\u0120n",
+                 "\u0120lo", "\u0120low", "er",
+                 "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
+        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
         self.special_tokens_map = {"unk_token": "<unk>"}

         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
-        with open(self.vocab_file, "w") as fp:
-            fp.write(json.dumps(vocab_tokens))
-        with open(self.merges_file, "w") as fp:
+        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+            fp.write(json.dumps(vocab_tokens) + "\n")
+        with open(self.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self):
-        return GPT2Tokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)
+    def get_tokenizer(self, **kwargs):
+        kwargs.update(self.special_tokens_map)
+        return GPT2Tokenizer.from_pretrained(self.tmpdirname, **kwargs)

     def get_input_output_texts(self):
         input_text = u"lower newer"
-        output_text = u"lower<unk>newer"
+        output_text = u" lower newer"
         return input_text, output_text

     def test_full_tokenizer(self):
         tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
-        text = "lower"
-        bpe_tokens = ["low", "er"]
+        text = "lower newer"
+        bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
         tokens = tokenizer.tokenize(text)
         self.assertListEqual(tokens, bpe_tokens)

         input_tokens = tokens + [tokenizer.unk_token]
-        input_bpe_tokens = [13, 12, 17]
+        input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
...
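The fixture changes above (and the identical RoBERTa ones below) all follow from GPT-2's byte-level BPE convention: a word preceded by a space is encoded with a leading `\u0120` byte ("Ġ"), and at this commit the tokenizer also applies that marker to the first word, which is why the round-trip text gains a leading space (`" lower newer"`). A sketch that rebuilds the toy fixture outside the test harness; the temp paths are incidental, and it assumes Python 3 and a pytorch_transformers build from around this commit:

    import json
    import os
    import tempfile

    from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES

    # Same toy vocabulary and merges as the updated fixture above.
    vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
             "\u0120", "\u0120l", "\u0120n",
             "\u0120lo", "\u0120low", "er",
             "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"]
    merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]

    tmpdir = tempfile.mkdtemp()
    vocab_file = os.path.join(tmpdir, VOCAB_FILES_NAMES['vocab_file'])
    merges_file = os.path.join(tmpdir, VOCAB_FILES_NAMES['merges_file'])
    with open(vocab_file, "w", encoding="utf-8") as fp:
        fp.write(json.dumps(dict(zip(vocab, range(len(vocab))))) + "\n")
    with open(merges_file, "w", encoding="utf-8") as fp:
        fp.write("\n".join(merges))

    tokenizer = GPT2Tokenizer(vocab_file, merges_file, unk_token="<unk>")
    # "lower" is treated as "<space>lower" and BPE-merges to "\u0120low" + "er";
    # the space before "newer" surfaces as a bare "\u0120" because the toy
    # merges never fuse it with "n".
    assert tokenizer.tokenize("lower newer") == \
        ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]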
@@ -45,8 +45,8 @@ class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):
         with open(self.merges_file, "w") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self):
-        return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname)
+    def get_tokenizer(self, **kwargs):
+        return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname, **kwargs)

     def get_input_output_texts(self):
         input_text = u"lower newer"
...
@@ -17,6 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals

 import os
 import json
 import unittest
+from io import open

 from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
 from .tokenization_tests_commons import CommonTestCases
@@ -30,36 +31,38 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
         # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
-                 "lo", "low", "er",
-                 "low", "lowest", "newer", "wider", "<unk>"]
+                 "\u0120", "\u0120l", "\u0120n",
+                 "\u0120lo", "\u0120low", "er",
+                 "\u0120lowest", "\u0120newer", "\u0120wider", "<unk>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
+        merges = ["#version: 0.2", "\u0120 l", "\u0120l o", "\u0120lo w", "e r", ""]
         self.special_tokens_map = {"unk_token": "<unk>"}

         self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
         self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
-        with open(self.vocab_file, "w") as fp:
-            fp.write(json.dumps(vocab_tokens))
-        with open(self.merges_file, "w") as fp:
+        with open(self.vocab_file, "w", encoding="utf-8") as fp:
+            fp.write(json.dumps(vocab_tokens) + "\n")
+        with open(self.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self):
-        return RobertaTokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)
+    def get_tokenizer(self, **kwargs):
+        kwargs.update(self.special_tokens_map)
+        return RobertaTokenizer.from_pretrained(self.tmpdirname, **kwargs)

     def get_input_output_texts(self):
         input_text = u"lower newer"
-        output_text = u"lower<unk>newer"
+        output_text = u" lower newer"
         return input_text, output_text

     def test_full_tokenizer(self):
         tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
-        text = "lower"
-        bpe_tokens = ["low", "er"]
+        text = "lower newer"
+        bpe_tokens = ["\u0120low", "er", "\u0120", "n", "e", "w", "er"]
         tokens = tokenizer.tokenize(text)
         self.assertListEqual(tokens, bpe_tokens)

         input_tokens = tokens + [tokenizer.unk_token]
-        input_bpe_tokens = [13, 12, 17]
+        input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
...
@@ -49,23 +49,48 @@ class CommonTestCases:
         def tearDown(self):
             shutil.rmtree(self.tmpdirname)

-        def get_tokenizer(self):
+        def get_tokenizer(self, **kwargs):
             raise NotImplementedError

         def get_input_output_texts(self):
             raise NotImplementedError

+        def test_tokenizers_common_properties(self):
+            tokenizer = self.get_tokenizer()
+            attributes_list = ["bos_token", "eos_token", "unk_token", "sep_token",
+                               "pad_token", "cls_token", "mask_token"]
+            for attr in attributes_list:
+                self.assertTrue(hasattr(tokenizer, attr))
+                self.assertTrue(hasattr(tokenizer, attr + "_id"))
+
+            self.assertTrue(hasattr(tokenizer, "additional_special_tokens"))
+            self.assertTrue(hasattr(tokenizer, 'additional_special_tokens_ids'))
+
+            attributes_list = ["max_len", "init_inputs", "init_kwargs", "added_tokens_encoder",
+                               "added_tokens_decoder"]
+            for attr in attributes_list:
+                self.assertTrue(hasattr(tokenizer, attr))
+
         def test_save_and_load_tokenizer(self):
+            # safety check on max_len default value so we are sure the test works
             tokenizer = self.get_tokenizer()
+            self.assertNotEqual(tokenizer.max_len, 42)
+
+            # Now let's start the test
+            tokenizer = self.get_tokenizer(max_len=42)
+
             before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")

             with TemporaryDirectory() as tmpdirname:
                 tokenizer.save_pretrained(tmpdirname)
-                tokenizer = tokenizer.from_pretrained(tmpdirname)
+                tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)

                 after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
                 self.assertListEqual(before_tokens, after_tokens)
+
+                self.assertEqual(tokenizer.max_len, 42)
+                tokenizer = self.tokenizer_class.from_pretrained(tmpdirname, max_len=43)
+                self.assertEqual(tokenizer.max_len, 43)

         def test_pickle_tokenizer(self):
             tokenizer = self.get_tokenizer()
@@ -95,7 +120,7 @@ class CommonTestCases:
             self.assertNotEqual(vocab_size, 0)
             self.assertEqual(vocab_size, all_size)

-            new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"]
+            new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
             added_toks = tokenizer.add_tokens(new_toks)
             vocab_size_2 = tokenizer.vocab_size
             all_size_2 = len(tokenizer)
@@ -105,13 +130,15 @@ class CommonTestCases:
             self.assertEqual(added_toks, len(new_toks))
             self.assertEqual(all_size_2, all_size + len(new_toks))

-            tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
+            tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l")
+            out_string = tokenizer.decode(tokens)
+
             self.assertGreaterEqual(len(tokens), 4)
             self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
             self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

             new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
                           'pad_token': "<<<<<|||>|>>>>|>"}
             added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
             vocab_size_3 = tokenizer.vocab_size
             all_size_3 = len(tokenizer)
@@ -122,14 +149,15 @@ class CommonTestCases:
             self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

             tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
+            out_string = tokenizer.decode(tokens)

             self.assertGreaterEqual(len(tokens), 6)
             self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
             self.assertGreater(tokens[0], tokens[1])
             self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
             self.assertGreater(tokens[-2], tokens[-3])
-            self.assertEqual(tokens[0], tokenizer.convert_tokens_to_ids(tokenizer.eos_token))
-            self.assertEqual(tokens[-2], tokenizer.convert_tokens_to_ids(tokenizer.pad_token))
+            self.assertEqual(tokens[0], tokenizer.eos_token_id)
+            self.assertEqual(tokens[-2], tokenizer.pad_token_id)

         def test_required_methods_tokenizer(self):
...
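Two API points the rewritten assertions rely on: `add_tokens`/`add_special_tokens` grow `len(tokenizer)` while leaving `vocab_size` (the base vocabulary) unchanged, and each special token now has a matching `*_id` property (`eos_token_id`, `pad_token_id`, and so on) that replaces the longer `convert_tokens_to_ids(tokenizer.eos_token)` spelling. A small sketch with `BertTokenizer` standing in for any concrete subclass; the checkpoint download is incidental to the point:

    from pytorch_transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    base_size = tokenizer.vocab_size
    tokenizer.add_tokens(["aaaaa bbbbbb", "cccccccccdddddddd"])
    tokenizer.add_special_tokens({'eos_token': ">>>>|||<||<<|<<"})

    assert tokenizer.vocab_size == base_size    # base vocab is untouched
    assert len(tokenizer) == base_size + 3      # 2 added + 1 special token

    # The *_id properties are shorthand for convert_tokens_to_ids(...):
    assert tokenizer.eos_token_id == tokenizer.convert_tokens_to_ids(tokenizer.eos_token)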
@@ -37,8 +37,9 @@ class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
         with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
             vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

-    def get_tokenizer(self):
-        return TransfoXLTokenizer.from_pretrained(self.tmpdirname, lower_case=True)
+    def get_tokenizer(self, **kwargs):
+        kwargs['lower_case'] = True
+        return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs)

     def get_input_output_texts(self):
         input_text = u"<unk> UNwanted , running"
...
@@ -44,8 +44,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
         with open(self.merges_file, "w") as fp:
             fp.write("\n".join(merges))

-    def get_tokenizer(self):
-        return XLMTokenizer.from_pretrained(self.tmpdirname)
+    def get_tokenizer(self, **kwargs):
+        return XLMTokenizer.from_pretrained(self.tmpdirname, **kwargs)

     def get_input_output_texts(self):
         input_text = u"lower newer"
...
@@ -35,8 +35,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
         tokenizer.save_pretrained(self.tmpdirname)

-    def get_tokenizer(self):
-        return XLNetTokenizer.from_pretrained(self.tmpdirname)
+    def get_tokenizer(self, **kwargs):
+        return XLNetTokenizer.from_pretrained(self.tmpdirname, **kwargs)

     def get_input_output_texts(self):
         input_text = u"This is a test"
...
...@@ -25,6 +25,7 @@ from .tokenization_transfo_xl import TransfoXLTokenizer ...@@ -25,6 +25,7 @@ from .tokenization_transfo_xl import TransfoXLTokenizer
from .tokenization_xlnet import XLNetTokenizer from .tokenization_xlnet import XLNetTokenizer
from .tokenization_xlm import XLMTokenizer from .tokenization_xlm import XLMTokenizer
from .tokenization_roberta import RobertaTokenizer from .tokenization_roberta import RobertaTokenizer
from .tokenization_distilbert import DistilBertTokenizer
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -39,13 +40,14 @@ class AutoTokenizer(object): ...@@ -39,13 +40,14 @@ class AutoTokenizer(object):
The tokenizer class to instantiate is selected as the first pattern matching The tokenizer class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order): in the `pretrained_model_name_or_path` string (in the following order):
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
- contains `roberta`: RobertaTokenizer (RoBERTa model)
- contains `bert`: BertTokenizer (Bert model) - contains `bert`: BertTokenizer (Bert model)
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
- contains `xlnet`: XLNetTokenizer (XLNet model) - contains `xlnet`: XLNetTokenizer (XLNet model)
- contains `xlm`: XLMTokenizer (XLM model) - contains `xlm`: XLMTokenizer (XLM model)
- contains `roberta`: RobertaTokenizer (RoBERTa model)
This class cannot be instantiated using `__init__()` (throw an error). This class cannot be instantiated using `__init__()` (throw an error).
""" """
...@@ -60,32 +62,45 @@ class AutoTokenizer(object): ...@@ -60,32 +62,45 @@ class AutoTokenizer(object):
The tokenizer class to instantiate is selected as the first pattern matching The tokenizer class to instantiate is selected as the first pattern matching
in the `pretrained_model_name_or_path` string (in the following order): in the `pretrained_model_name_or_path` string (in the following order):
- contains `distilbert`: DistilBertTokenizer (DistilBert model)
- contains `roberta`: RobertaTokenizer (XLM model)
- contains `bert`: BertTokenizer (Bert model) - contains `bert`: BertTokenizer (Bert model)
- contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
- contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
- contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model) - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
- contains `xlnet`: XLNetTokenizer (XLNet model) - contains `xlnet`: XLNetTokenizer (XLNet model)
- contains `xlm`: XLMTokenizer (XLM model) - contains `xlm`: XLMTokenizer (XLM model)
- contains `roberta`: RobertaTokenizer (XLM model)
Params: Params:
pretrained_model_name_or_path: either:
- a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
- a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
- (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
cache_dir: (`optional`) string:
Path to a directory in which the downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
force_download: (`optional`) boolean, default False:
Force a (re-)download of the vocabulary files and override the cached versions if they exist.
proxies: (`optional`) dict, default None:
A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
The proxies are used on each request.
inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
Examples::

    tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')          # Download vocabulary from S3 and cache.
    tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')   # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
"""
if 'distilbert' in pretrained_model_name_or_path:
    return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'roberta' in pretrained_model_name_or_path:
    return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
elif 'bert' in pretrained_model_name_or_path:
    return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
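Because this dispatch is a plain substring test, branch order carries the logic: 'distilbert-base-uncased' also contains the substring 'bert', so the `distilbert` branch must come before the `bert` branch or DistilBERT checkpoints would be routed to BertTokenizer. A minimal sketch of the same ordering pitfall (the helper name is illustrative, not part of the library):

# Illustrative sketch of the substring dispatch above; checking the
# longer, more specific pattern first is what keeps 'distilbert-*'
# names from falling into the 'bert' branch.
def pick_tokenizer_name(model_name):
    if 'distilbert' in model_name:
        return 'DistilBertTokenizer'
    elif 'roberta' in model_name:
        return 'RobertaTokenizer'
    elif 'bert' in model_name:
        return 'BertTokenizer'
    raise ValueError("Unrecognized model name: %s" % model_name)

assert pick_tokenizer_name('distilbert-base-uncased') == 'DistilBertTokenizer'
assert pick_tokenizer_name('bert-base-uncased') == 'BertTokenizer'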
...
@@ -63,6 +63,23 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
'bert-base-cased-finetuned-mrpc': 512,
}
PRETRAINED_INIT_CONFIGURATION = {
'bert-base-uncased': {'do_lower_case': True},
'bert-large-uncased': {'do_lower_case': True},
'bert-base-cased': {'do_lower_case': False},
'bert-large-cased': {'do_lower_case': False},
'bert-base-multilingual-uncased': {'do_lower_case': True},
'bert-base-multilingual-cased': {'do_lower_case': False},
'bert-base-chinese': {'do_lower_case': False},
'bert-base-german-cased': {'do_lower_case': False},
'bert-large-uncased-whole-word-masking': {'do_lower_case': True},
'bert-large-cased-whole-word-masking': {'do_lower_case': False},
'bert-large-uncased-whole-word-masking-finetuned-squad': {'do_lower_case': True},
'bert-large-cased-whole-word-masking-finetuned-squad': {'do_lower_case': False},
'bert-base-cased-finetuned-mrpc': {'do_lower_case': False},
}
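This new mapping gives `from_pretrained` a per-checkpoint default for `do_lower_case`, which is what makes the string-sniffing override removed further below unnecessary. A minimal sketch of how such defaults can be merged with user-supplied kwargs (an illustration of the idea, not the actual PreTrainedTokenizer internals):

# Illustration only: per-checkpoint defaults are applied first, and
# explicit user kwargs take precedence over them.
def resolve_init_kwargs(pretrained_model_name_or_path, user_kwargs):
    defaults = PRETRAINED_INIT_CONFIGURATION.get(pretrained_model_name_or_path, {})
    merged = dict(defaults)
    merged.update(user_kwargs)
    return merged

# resolve_init_kwargs('bert-base-cased', {})                      -> {'do_lower_case': False}
# resolve_init_kwargs('bert-base-cased', {'do_lower_case': True}) -> {'do_lower_case': True}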
def load_vocab(vocab_file):
    """Loads a vocabulary file into a dictionary."""
    vocab = collections.OrderedDict()
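The diff elides the rest of this function's body; a typical WordPiece vocabulary loader maps each line of the file to its line index, roughly as follows (a sketch under that assumption, not the verbatim library code):

# Sketch: one token per line, token -> line index.
import collections

def load_vocab_sketch(vocab_file):
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, line in enumerate(reader):
            vocab[line.rstrip("\n")] = index
    return vocab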
@@ -100,6 +117,7 @@ class BertTokenizer(PreTrainedTokenizer):
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

def __init__(self, vocab_file, do_lower_case=True, do_basic_tokenize=True, never_split=None,
@@ -174,15 +192,15 @@ class BertTokenizer(PreTrainedTokenizer):
Adds special tokens to a sequence for sequence classification tasks.
A BERT sequence has the following format: [CLS] X [SEP]
"""
return [self.cls_token_id] + token_ids + [self.sep_token_id]
def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
    """
    Adds special tokens to a sequence pair for sequence classification tasks.
    A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
    """
    sep = [self.sep_token_id]
    cls = [self.cls_token_id]
    return cls + token_ids_0 + sep + token_ids_1 + sep
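For concreteness, with the stock bert-base-uncased special-token ids ([CLS] = 101, [SEP] = 102), the pair layout comes out as below; the word-piece ids for the two sentences are illustrative placeholders:

# [CLS] A ... [SEP] B ... [SEP], using bert-base-uncased's 101/102.
token_ids_0 = [7592, 2088]   # illustrative ids for sentence A
token_ids_1 = [2129, 2024]   # illustrative ids for sentence B
pair = [101] + token_ids_0 + [102] + token_ids_1 + [102]
# -> [101, 7592, 2088, 102, 2129, 2024, 102]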
def save_vocabulary(self, vocab_path):
@@ -202,24 +220,6 @@ class BertTokenizer(PreTrainedTokenizer):
index += 1
return (vocab_file,)
@classmethod
def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
""" Instantiate a BertTokenizer from pre-trained vocabulary files.
"""
if pretrained_model_name_or_path in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES:
if '-cased' in pretrained_model_name_or_path and kwargs.get('do_lower_case', True):
logger.warning("The pre-trained model you are loading is a cased model but you have not set "
"`do_lower_case` to False. We are setting `do_lower_case=False` for you but "
"you may want to check this behavior.")
kwargs['do_lower_case'] = False
elif '-cased' not in pretrained_model_name_or_path and not kwargs.get('do_lower_case', True):
logger.warning("The pre-trained model you are loading is an uncased model but you have set "
"`do_lower_case` to False. We are setting `do_lower_case=True` for you "
"but you may want to check this behavior.")
kwargs['do_lower_case'] = True
return super(BertTokenizer, cls)._from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
class BasicTokenizer(object):
    """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
...