Commit d5481cbe authored by thomwolf

adding tests to examples - updating summary module - coverage update

parent c079d7dd
 [run]
 source=pytorch_transformers
+omit =
+    # skip conversion scripts from testing for now
+    */convert_*
 [report]
 exclude_lines =
     pragma: no cover
...
@@ -126,4 +126,5 @@ models
 proc_data
 
 # examples
+runs
 examples/runs
\ No newline at end of file
@@ -60,25 +60,14 @@ TOKENIZER_CLASSES = {
     'xlm': XLMTokenizer,
 }
 
-def train(args, train_features, model):
+def train(args, train_dataset, model):
     """ Train the model """
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
 
-    # Convert in tensors and build dataloader
-    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
-    if args.output_mode == "classification":
-        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
-    elif args.output_mode == "regression":
-        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
-
     args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-    train_sampler = RandomSampler(train_data) if args.local_rank == -1 else DistributedSampler(train_data)
-    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
     num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
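
Editor's note: train() now receives a ready-made dataset and only chooses a sampler for it. A minimal, self-contained sketch of that sampler selection (the helper name is illustrative, not from the commit):

from torch.utils.data import DataLoader, RandomSampler
from torch.utils.data.distributed import DistributedSampler

def make_train_dataloader(dataset, local_rank, batch_size):
    # single-process runs shuffle randomly; distributed workers shard the data
    sampler = RandomSampler(dataset) if local_rank == -1 else DistributedSampler(dataset)
    return DataLoader(dataset, sampler=sampler, batch_size=batch_size)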
@@ -109,19 +98,24 @@ def train(args, train_features, model):
     # Train!
     logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_features))
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
     logger.info("  Batch size = %d", args.train_batch_size)
-    logger.info("  Num steps = %d", num_train_optimization_steps)
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", num_train_optimization_steps)
     global_step = 0
     tr_loss = 0
     model.train()
+    optimizer.zero_grad()
     for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
         for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
             batch = tuple(t.to(args.device) for t in batch)
-            input_ids, input_mask, segment_ids, label_ids = batch
-            outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids)
+            inputs = {'input_ids':      batch[0],
+                      'attention_mask': batch[1],
+                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
+                      'labels':         batch[3]}
+            outputs = model(**inputs)
             loss = outputs[0]
 
             if args.n_gpu > 1:
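
Editor's note: the dictionary-based call above is what makes the loop model-agnostic. A minimal sketch of that mapping, assuming batches ordered as (input_ids, input_mask, segment_ids, label_ids); the helper name is hypothetical:

def batch_to_inputs(batch, model_type):
    # only BERT and XLNet receive segment ids in this setup; other models get None
    return {'input_ids': batch[0],
            'attention_mask': batch[1],
            'token_type_ids': batch[2] if model_type in ['bert', 'xlnet'] else None,
            'labels': batch[3]}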
@@ -150,30 +144,20 @@ def train(args, train_features, model):
     return global_step, tr_loss / global_step
-def evaluate(args, eval_task, eval_output_dir, eval_features, model):
+def evaluate(args, eval_task, eval_output_dir, dataset, model):
     """ Evaluate the model """
     if os.path.exists(eval_output_dir) and os.listdir(eval_output_dir) and args.do_train and not args.overwrite_output_dir:
         raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(eval_output_dir))
     if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
         os.makedirs(eval_output_dir)
 
-    # Convert in tensors and build dataloader
-    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-    if args.output_mode == "classification":
-        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
-    elif args.output_mode == "regression":
-        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
-    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-
     # Note that DistributedSampler samples randomly
-    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
+    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
+    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
     # Eval!
     logger.info("***** Running evaluation *****")
-    logger.info("  Num examples = %d", len(eval_features))
+    logger.info("  Num examples = %d", len(dataset))
     logger.info("  Batch size = %d", args.eval_batch_size)
     model.eval()
     eval_loss = 0
@@ -214,36 +198,47 @@ def evaluate(args, eval_task, eval_output_dir, eval_features, model):
             logger.info("  %s = %s", key, str(result[key]))
             writer.write("%s = %s\n" % (key, str(result[key])))
+    return result
-def load_and_cache_examples(args, task, tokenizer, eval=False):
+def load_and_cache_examples(args, task, tokenizer, evaluate=False):
     processor = processors[task]()
     output_mode = output_modes[task]
-    label_list = processor.get_labels()
-    # Load and cache data
-    examples = processor.get_dev_examples(args.data_dir)
-    cached_features_file = os.path.join(args.data_dir, '{}_{}_{}_{}'.format(
-        'dev' if eval else 'train',
+    # Load data features from cache or dataset file
+    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
+        'dev' if evaluate else 'train',
         list(filter(None, args.model_name.split('/'))).pop(),
         str(args.max_seq_length),
         str(task)))
     if os.path.exists(cached_features_file):
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:
-        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
-                                                cls_token_at_end=bool(args.model_type not in ['bert', 'xlm']),
-                                                cls_token=tokenizer.cls_token,
-                                                sep_token=tokenizer.sep_token, cls_token_segment_id=2,
-                                                pad_on_left=True, pad_token_segment_id=4)
-    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-        logger.info("Saving features into cached file %s", cached_features_file)
-        torch.save(features, cached_features_file)
+        logger.info("Creating features from dataset file at %s", args.data_dir)
+        label_list = processor.get_labels()
+        examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
+        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
+                                                cls_token_at_end=bool(args.model_type in ['xlnet']),  # xlnet has a cls token at the end
+                                                cls_token=tokenizer.cls_token,
+                                                sep_token=tokenizer.sep_token,
+                                                cls_token_segment_id=2 if args.model_type in ['xlnet'] else 1,
+                                                pad_on_left=bool(args.model_type in ['xlnet']),  # pad on the left for xlnet
+                                                pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
+        if args.local_rank in [-1, 0]:
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
 
-    return features
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    if output_mode == "classification":
+        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
+    elif output_mode == "regression":
+        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
+    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+    return dataset
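
Editor's note: load_and_cache_examples now owns the whole load-or-build-then-cache cycle and hands back a ready TensorDataset. A distilled sketch of that caching pattern, with a hypothetical helper name:

import os
import torch

def cached(path, build):
    # return the cached object if the file exists, otherwise build it and cache it
    if os.path.exists(path):
        return torch.load(path)
    obj = build()
    torch.save(obj, path)
    return obj

The function above plays the same role as the if/else around cached_features_file: expensive featurization runs once per (split, model, sequence length, task) combination.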
 def main():
@@ -350,10 +345,10 @@ def main():
         torch.distributed.barrier()
 
     args.model_type = args.model_name.lower().split('-')[0]
-    args.tokenizer_class = TOKENIZER_CLASSES[args.model_type]
-    args.model_class = MODEL_CLASSES[args.model_type]
-    tokenizer = args.tokenizer_class.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)
-    model = args.model_class.from_pretrained(args.model_name, num_labels=num_labels)
+    tokenizer_class = TOKENIZER_CLASSES[args.model_type]
+    model_class = MODEL_CLASSES[args.model_type]
+    tokenizer = tokenizer_class.from_pretrained(args.model_name, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name, num_labels=num_labels)
 
     if args.local_rank == 0:
         torch.distributed.barrier()
@@ -372,23 +367,30 @@ def main():
     # Training
     if args.do_train:
-        train_features = load_and_cache_examples(args, args.task_name, tokenizer, eval=False)
-        global_step, tr_loss = train(args, train_features, model)
+        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
+        global_step, tr_loss = train(args, train_dataset, model)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
     # Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
     if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Save a trained model, configuration and tokenizer
+        # Create output directory if needed
+        if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+            raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
         model.save_pretrained(args.output_dir)
-        tokenizer.save_vocabulary(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
 
         # Good practice: save your training arguments together with the trained model
         torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
 
         # Load a trained model and vocabulary that you have fine-tuned
-        model = args.model_class.from_pretrained(args.output_dir)
-        tokenizer = args.tokenizer_class.from_pretrained(args.output_dir)
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
         model.to(args.device)
 
     # Evaluation
@@ -398,9 +400,11 @@ def main():
     eval_outputs_dirs = (args.output_dir, args.output_dir + '-MM') if args.task_name == "mnli" else (args.output_dir,)
     for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
-        eval_features = load_and_cache_examples(args, eval_task, tokenizer, eval=True)
-        evaluate(args, eval_task, eval_output_dir, eval_features, model)
+        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
+        result = evaluate(args, eval_task, eval_output_dir, eval_dataset, model)
+
+    return result
 
 if __name__ == "__main__":
...
@@ -19,6 +19,7 @@ from __future__ import print_function
 import sys
 import unittest
 import argparse
+import logging
 
 try:
     # python 3.4+ can use builtin unittest.mock instead of mock package
@@ -26,7 +27,11 @@ try:
 except ImportError:
     from mock import patch
 
-import run_bert_squad as rbs
+import run_glue
+
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger()
 
 def get_setup_file():
     parser = argparse.ArgumentParser()
@@ -36,12 +41,18 @@ def get_setup_file():
 class ExamplesTests(unittest.TestCase):
 
-    def test_run_squad(self):
-        testargs = ["prog", "-f", "/home/test/setup.py"]
-        with patch.object(sys, 'argv', testargs):
-            setup = get_setup_file()
-            assert setup == "/home/test/setup.py"
-            # rbs.main()
+    def test_run_glue(self):
+        stream_handler = logging.StreamHandler(sys.stdout)
+        logger.addHandler(stream_handler)
+
+        testargs = ["run_glue.py", "--data_dir=./examples/tests_samples/MRPC/",
+                    "--task_name=mrpc", "--do_train", "--do_eval", "--output_dir=./examples/tests_samples/temp_dir",
+                    "--train_batch_size=4", "--eval_batch_size=2", "--num_train_epochs=2.0", "--overwrite_output_dir"]
+        model_name = "--model_name=xlnet-large-cased"
+        with patch.object(sys, 'argv', testargs + [model_name]):
+            result = run_glue.main()
+            for value in result.values():
+                self.assertGreaterEqual(value, 0.75)
 
 if __name__ == "__main__":
...
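
Editor's note: the test drives run_glue.main() end to end by patching sys.argv, so main() parses the test arguments exactly as if they came from the command line. A self-contained sketch of that pattern, with a toy main() standing in for run_glue.main():

import sys
from unittest.mock import patch

def toy_main():
    # stands in for run_glue.main(): a real script would parse sys.argv here
    return sys.argv[1:]

testargs = ["prog.py", "--task_name=mrpc", "--do_train"]
with patch.object(sys, "argv", testargs):
    assert toy_main() == ["--task_name=mrpc", "--do_train"]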
+*.*
+cache*
+temp*
+!*.tsv
+!.gitignore
\ No newline at end of file
...
Quality #1 ID #2 ID #1 String #2 String
1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy .
0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
...
Quality #1 ID #2 ID #1 String #2 String
1 1355540 1355592 He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . " The foodservice pie business does not fit our long-term growth strategy .
0 2029631 2029565 Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war . His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
0 487993 487952 The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat . The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
1 1989515 1989458 The AFL-CIO is waiting until October to decide if it will endorse a candidate . The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
0 1783137 1782659 No dates have been set for the civil or the criminal trial . No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
1 3039165 3039036 Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed . It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
@@ -28,7 +28,6 @@ import torch
 from torch import nn
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .file_utils import cached_path
 from .modeling_utils import WEIGHTS_NAME, CONFIG_NAME, PretrainedConfig, PreTrainedModel, prune_linear_layer
 
 logger = logging.getLogger(__name__)
...
@@ -30,7 +30,6 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
-from .file_utils import cached_path
 from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
                              PreTrainedModel, prune_conv1d_layer, SequenceSummary)
 from .modeling_bert import BertLayerNorm as LayerNorm
@@ -122,9 +121,8 @@ class GPT2Config(PretrainedConfig):
                  predict_special_tokens=True,
                  summary_type='token_ids',
                  summary_use_proj=True,
-                 summary_num_classes=1,
                  summary_activation=None,
-                 summary_dropout=0.1,
+                 summary_first_dropout=0.1,
                  **kwargs
                  ):
         """Constructs GPT2Config.
@@ -172,9 +170,8 @@ class GPT2Config(PretrainedConfig):
             self.predict_special_tokens = predict_special_tokens
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
-            self.summary_num_classes = summary_num_classes
             self.summary_activation = summary_activation
-            self.summary_dropout = summary_dropout
+            self.summary_first_dropout = summary_first_dropout
         else:
             raise ValueError(
                 "First argument must be either a vocabulary size (int)"
...
@@ -30,9 +30,8 @@ import torch.nn as nn
 from torch.nn import CrossEntropyLoss
 from torch.nn.parameter import Parameter
 
-from .file_utils import cached_path
 from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
                              PreTrainedModel, prune_conv1d_layer, SequenceSummary)
 from .modeling_bert import BertLayerNorm as LayerNorm
 
 logger = logging.getLogger(__name__)
@@ -150,9 +149,8 @@ class OpenAIGPTConfig(PretrainedConfig):
                  predict_special_tokens=True,
                  summary_type='token_ids',
                  summary_use_proj=True,
-                 summary_num_classes=1,
                  summary_activation=None,
-                 summary_dropout=0.1,
+                 summary_first_dropout=0.1,
                  **kwargs
                  ):
         """Constructs OpenAIGPTConfig.
@@ -203,9 +201,8 @@ class OpenAIGPTConfig(PretrainedConfig):
             self.predict_special_tokens = predict_special_tokens
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
-            self.summary_num_classes = summary_num_classes
             self.summary_activation = summary_activation
-            self.summary_dropout = summary_dropout
+            self.summary_first_dropout = summary_first_dropout
         else:
             raise ValueError(
                 "First argument must be either a vocabulary size (int)"
...
@@ -36,7 +36,6 @@ from torch.nn.parameter import Parameter
 from .modeling_bert import BertLayerNorm as LayerNorm
 from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits
-from .file_utils import cached_path
 from .modeling_utils import CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel
 
 logger = logging.getLogger(__name__)
...
@@ -25,7 +25,7 @@ from io import open
 import torch
 from torch import nn
-from torch.nn import CrossEntropyLoss, MSELoss, functional as F
+from torch.nn import CrossEntropyLoss, functional as F
 
 from .file_utils import cached_path
@@ -514,10 +514,10 @@ class SequenceSummary(nn.Module):
                 - 'token_ids' => supply a Tensor of classification token indices (GPT/GPT-2)
                 - 'attn' => Not implemented now, use multi-head attention
             summary_use_proj: Add a projection after the vector extraction
-            summary_num_classes: If > 0: the projection outputs to n classes (otherwise to hidden_size)
-            summary_activation:
-                'tanh' => add a tanh activation to the output
-                None => no activation
+            summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
+            summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default: None.
+            summary_first_dropout: Add a dropout before the projection and activation
+            summary_last_dropout: Add a dropout after the projection and activation
         """
     def __init__(self, config):
         super(SequenceSummary, self).__init__()
@@ -531,8 +531,8 @@ class SequenceSummary(nn.Module):
         self.summary = nn.Identity()
         if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
-            if hasattr(config, 'summary_num_classes') and config.summary_num_classes > 0:
-                num_classes = config.summary_num_classes
+            if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
+                num_classes = config.num_labels
             else:
                 num_classes = config.hidden_size
             self.summary = nn.Linear(config.hidden_size, num_classes)
@@ -541,7 +541,13 @@ class SequenceSummary(nn.Module):
         if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
             self.activation = nn.Tanh()
 
-        self.dropout = nn.Dropout(config.summary_dropout)
+        self.first_dropout = nn.Identity()
+        if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
+            self.first_dropout = nn.Dropout(config.summary_first_dropout)
+
+        self.last_dropout = nn.Identity()
+        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
+            self.last_dropout = nn.Dropout(config.summary_last_dropout)
 
     def forward(self, hidden_states, token_ids=None):
         """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
@@ -567,9 +573,10 @@ class SequenceSummary(nn.Module):
         elif self.summary_type == 'attn':
             raise NotImplementedError
 
+        output = self.first_dropout(output)
         output = self.summary(output)
         output = self.activation(output)
-        output = self.dropout(output)
+        output = self.last_dropout(output)
 
         return output
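
Editor's note: after this change SequenceSummary applies dropout in two places, before the projection (summary_first_dropout) and after the activation (summary_last_dropout), instead of a single trailing dropout. A condensed, self-contained sketch of the new pipeline for the 'last' summary type (class name and sizes are illustrative, not from the commit):

import torch
import torch.nn as nn

class TinySummary(nn.Module):
    def __init__(self, hidden_size, num_classes, first_dropout=0.1, last_dropout=0.0):
        super(TinySummary, self).__init__()
        self.first_dropout = nn.Dropout(first_dropout) if first_dropout > 0 else nn.Identity()
        self.summary = nn.Linear(hidden_size, num_classes)
        self.activation = nn.Tanh()
        self.last_dropout = nn.Dropout(last_dropout) if last_dropout > 0 else nn.Identity()

    def forward(self, hidden_states):
        output = hidden_states[:, -1]  # 'last' summary: take the final token's hidden state
        output = self.first_dropout(output)
        output = self.summary(output)
        output = self.activation(output)
        return self.last_dropout(output)

print(TinySummary(8, 3)(torch.randn(2, 5, 8)).shape)  # torch.Size([2, 3])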
...
@@ -14,18 +14,14 @@
 # limitations under the License.
 """ PyTorch XLM model.
 """
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import json
 import logging
 import math
-import os
 import sys
 from io import open
-import math
 import itertools
 
 import numpy as np
@@ -34,9 +30,8 @@ from torch import nn
 from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .file_utils import cached_path
-from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
-                             prune_linear_layer, SequenceSummary, SQuADHead)
+from .modeling_utils import (PretrainedConfig, PreTrainedModel,
+                             prune_linear_layer, SequenceSummary, SQuADHead)
 
 logger = logging.getLogger(__name__)
@@ -79,10 +74,11 @@ class XLMConfig(PretrainedConfig):
                  finetuning_task=None,
                  num_labels=2,
-                 summary_type='last',
+                 summary_type='first',
                  summary_use_proj=True,
-                 summary_activation='tanh',
-                 summary_dropout=0.1,
+                 summary_activation=None,
+                 summary_proj_to_labels=True,
+                 summary_first_dropout=0.1,
                  start_n_top=5,
                  end_n_top=5,
                  **kwargs):
@@ -164,7 +160,8 @@ class XLMConfig(PretrainedConfig):
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
             self.summary_activation = summary_activation
-            self.summary_dropout = summary_dropout
+            self.summary_proj_to_labels = summary_proj_to_labels
+            self.summary_first_dropout = summary_first_dropout
             self.start_n_top = start_n_top
             self.end_n_top = end_n_top
         else:
...
@@ -31,9 +31,8 @@ from torch import nn
 from torch.nn import functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .file_utils import cached_path
 from .modeling_utils import (CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig, PreTrainedModel,
                              SequenceSummary, PoolerAnswerClass, PoolerEndLogits, PoolerStartLogits)
 
 logger = logging.getLogger(__name__)
@@ -227,7 +226,7 @@ class XLNetConfig(PretrainedConfig):
                  summary_type='last',
                  summary_use_proj=True,
                  summary_activation='tanh',
-                 summary_dropout=0.1,
+                 summary_last_dropout=0.1,
                  start_n_top=5,
                  end_n_top=5,
                  **kwargs):
@@ -314,7 +313,7 @@ class XLNetConfig(PretrainedConfig):
             self.summary_type = summary_type
             self.summary_use_proj = summary_use_proj
             self.summary_activation = summary_activation
-            self.summary_dropout = summary_dropout
+            self.summary_last_dropout = summary_last_dropout
             self.start_n_top = start_n_top
             self.end_n_top = end_n_top
         else:
...
@@ -113,8 +113,6 @@ class BertTokenizer(PreTrainedTokenizer):
             raise ValueError(
                 "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
                 "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
-        if never_split is None:
-            never_split = self.all_special_tokens
         self.vocab = load_vocab(vocab_file)
         self.ids_to_tokens = collections.OrderedDict(
             [(ids, tok) for tok, ids in self.vocab.items()])
...
@@ -142,11 +142,7 @@ class PreTrainedTokenizer(object):
         self.added_tokens_decoder = {}
 
         for key, value in kwargs.items():
-            if key not in self.SPECIAL_TOKENS_ATTRIBUTES:
-                raise ValueError(
-                    "PreTrainedTokenizer.__init__() argument {} should be in {}".format(
-                        key, ', '.join(self.SPECIAL_TOKENS_ATTRIBUTES)))
-            else:
+            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                 setattr(self, key, value)
...
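
Editor's note: the constructor now silently ignores unrecognised kwargs instead of raising ValueError, setting only the whitelisted special-token attributes. A toy sketch of the new behaviour (class and attribute names are illustrative):

class ToyTokenizer(object):
    SPECIAL_TOKENS_ATTRIBUTES = ['cls_token', 'sep_token']

    def __init__(self, **kwargs):
        for key, value in kwargs.items():
            if key in self.SPECIAL_TOKENS_ATTRIBUTES:
                setattr(self, key, value)  # unknown keys are now skipped, not errors

tok = ToyTokenizer(cls_token='[CLS]', do_lower_case=True)  # no longer raises
assert tok.cls_token == '[CLS]' and not hasattr(tok, 'do_lower_case')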
@@ -20,13 +20,9 @@ import json
 import logging
 import os
 import re
-import sys
 from io import open
 
-from tqdm import tqdm
-
-from .file_utils import cached_path
-from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_bert import BasicTokenizer
 
 logger = logging.getLogger(__name__)
...