Commit 0f091062 authored by thomwolf's avatar thomwolf
Browse files

Merge branch 'glue-example' into tf2

parents c4acc3a8 e4022d96
from argparse import ArgumentParser
from pathlib import Path
import os
import torch
import logging
import json
import random
import numpy as np
from collections import namedtuple
from tempfile import TemporaryDirectory
from torch.utils.data import DataLoader, Dataset, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm
from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
from pytorch_transformers.modeling_bert import BertForPreTraining
from pytorch_transformers.tokenization_bert import BertTokenizer
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next")
log_format = '%(asctime)-10s: %(message)s'
logging.basicConfig(level=logging.INFO, format=log_format)
def convert_example_to_features(example, tokenizer, max_seq_length):
tokens = example["tokens"]
segment_ids = example["segment_ids"]
is_random_next = example["is_random_next"]
masked_lm_positions = example["masked_lm_positions"]
masked_lm_labels = example["masked_lm_labels"]
assert len(tokens) == len(segment_ids) <= max_seq_length # The preprocessed data should be already truncated
input_ids = tokenizer.convert_tokens_to_ids(tokens)
masked_label_ids = tokenizer.convert_tokens_to_ids(masked_lm_labels)
input_array = np.zeros(max_seq_length, dtype=np.int)
input_array[:len(input_ids)] = input_ids
mask_array = np.zeros(max_seq_length, dtype=np.bool)
mask_array[:len(input_ids)] = 1
segment_array = np.zeros(max_seq_length, dtype=np.bool)
segment_array[:len(segment_ids)] = segment_ids
lm_label_array = np.full(max_seq_length, dtype=np.int, fill_value=-1)
lm_label_array[masked_lm_positions] = masked_label_ids
features = InputFeatures(input_ids=input_array,
input_mask=mask_array,
segment_ids=segment_array,
lm_label_ids=lm_label_array,
is_next=is_random_next)
return features
class PregeneratedDataset(Dataset):
def __init__(self, training_path, epoch, tokenizer, num_data_epochs, reduce_memory=False):
self.vocab = tokenizer.vocab
self.tokenizer = tokenizer
self.epoch = epoch
self.data_epoch = epoch % num_data_epochs
data_file = training_path / f"epoch_{self.data_epoch}.json"
metrics_file = training_path / f"epoch_{self.data_epoch}_metrics.json"
assert data_file.is_file() and metrics_file.is_file()
metrics = json.loads(metrics_file.read_text())
num_samples = metrics['num_training_examples']
seq_len = metrics['max_seq_len']
self.temp_dir = None
self.working_dir = None
if reduce_memory:
self.temp_dir = TemporaryDirectory()
self.working_dir = Path(self.temp_dir.name)
input_ids = np.memmap(filename=self.working_dir/'input_ids.memmap',
mode='w+', dtype=np.int32, shape=(num_samples, seq_len))
input_masks = np.memmap(filename=self.working_dir/'input_masks.memmap',
shape=(num_samples, seq_len), mode='w+', dtype=np.bool)
segment_ids = np.memmap(filename=self.working_dir/'segment_ids.memmap',
shape=(num_samples, seq_len), mode='w+', dtype=np.bool)
lm_label_ids = np.memmap(filename=self.working_dir/'lm_label_ids.memmap',
shape=(num_samples, seq_len), mode='w+', dtype=np.int32)
lm_label_ids[:] = -1
is_nexts = np.memmap(filename=self.working_dir/'is_nexts.memmap',
shape=(num_samples,), mode='w+', dtype=np.bool)
else:
input_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.int32)
input_masks = np.zeros(shape=(num_samples, seq_len), dtype=np.bool)
segment_ids = np.zeros(shape=(num_samples, seq_len), dtype=np.bool)
lm_label_ids = np.full(shape=(num_samples, seq_len), dtype=np.int32, fill_value=-1)
is_nexts = np.zeros(shape=(num_samples,), dtype=np.bool)
logging.info(f"Loading training examples for epoch {epoch}")
with data_file.open() as f:
for i, line in enumerate(tqdm(f, total=num_samples, desc="Training examples")):
line = line.strip()
example = json.loads(line)
features = convert_example_to_features(example, tokenizer, seq_len)
input_ids[i] = features.input_ids
segment_ids[i] = features.segment_ids
input_masks[i] = features.input_mask
lm_label_ids[i] = features.lm_label_ids
is_nexts[i] = features.is_next
assert i == num_samples - 1 # Assert that the sample count metric was true
logging.info("Loading complete!")
self.num_samples = num_samples
self.seq_len = seq_len
self.input_ids = input_ids
self.input_masks = input_masks
self.segment_ids = segment_ids
self.lm_label_ids = lm_label_ids
self.is_nexts = is_nexts
def __len__(self):
return self.num_samples
def __getitem__(self, item):
return (torch.tensor(self.input_ids[item].astype(np.int64)),
torch.tensor(self.input_masks[item].astype(np.int64)),
torch.tensor(self.segment_ids[item].astype(np.int64)),
torch.tensor(self.lm_label_ids[item].astype(np.int64)),
torch.tensor(self.is_nexts[item].astype(np.int64)))
def main():
parser = ArgumentParser()
parser.add_argument('--pregenerated_data', type=Path, required=True)
parser.add_argument('--output_dir', type=Path, required=True)
parser.add_argument("--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
parser.add_argument("--do_lower_case", action="store_true")
parser.add_argument("--reduce_memory", action="store_true",
help="Store training data as on-disc memmaps to massively reduce memory usage")
parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument('--fp16',
action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--loss_scale',
type=float, default=0,
help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")
parser.add_argument("--warmup_steps",
default=0,
type=int,
help="Linear warmup over warmup_steps.")
parser.add_argument("--adam_epsilon",
default=1e-8,
type=float,
help="Epsilon for Adam optimizer.")
parser.add_argument("--learning_rate",
default=3e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
args = parser.parse_args()
assert args.pregenerated_data.is_dir(), \
"--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!"
samples_per_epoch = []
for i in range(args.epochs):
epoch_file = args.pregenerated_data / f"epoch_{i}.json"
metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json"
if epoch_file.is_file() and metrics_file.is_file():
metrics = json.loads(metrics_file.read_text())
samples_per_epoch.append(metrics['num_training_examples'])
else:
if i == 0:
exit("No training data was found!")
print(f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs}).")
print("This script will loop over the available data, but training diversity may be negatively impacted.")
num_data_epochs = i
break
else:
num_data_epochs = args.epochs
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of sychronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
logging.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
if args.output_dir.is_dir() and list(args.output_dir.iterdir()):
logging.warning(f"Output directory ({args.output_dir}) already exists and is not empty!")
args.output_dir.mkdir(parents=True, exist_ok=True)
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
total_train_examples = 0
for i in range(args.epochs):
# The modulo takes into account the fact that we may loop over limited epochs of data
total_train_examples += samples_per_epoch[i % len(samples_per_epoch)]
num_train_optimization_steps = int(
total_train_examples / args.train_batch_size / args.gradient_accumulation_steps)
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare model
model = BertForPreTraining.from_pretrained(args.bert_model)
# We don't need to manually call model.half() following Apex's recommend
# if args.fp16:
# model.half()
model.to(device)
if args.local_rank != -1:
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
# Prepare optimizer
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps,
t_total=num_train_optimization_steps)
if args.fp16:
try:
# from apex.optimizers import FP16_Optimizer
# from apex.optimizers import FusedAdam
from apex import amp
except ImportError:
raise ImportError(
"Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
# This below line of code is the main upgrade of Apex Fp16 implementation. I chose opt_leve="01"
# because it's recommended for typical use by Apex. We can make it configured
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
# We don't need to use FP16_Optimizer wrapping over FusedAdam as well. Now Apex supports all Pytorch Optimizer
# optimizer = FusedAdam(optimizer_grouped_parameters,
# lr=args.learning_rate,
# bias_correction=False,
# max_grad_norm=1.0)
# if args.loss_scale == 0:
# optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
# else:
# optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
# else:
# optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
# scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)
global_step = 0
logging.info("***** Running training *****")
logging.info(f" Num examples = {total_train_examples}")
logging.info(" Batch size = %d", args.train_batch_size)
logging.info(" Num steps = %d", num_train_optimization_steps)
model.train()
for epoch in range(args.epochs):
epoch_dataset = PregeneratedDataset(epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer,
num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory)
if args.local_rank == -1:
train_sampler = RandomSampler(epoch_dataset)
else:
train_sampler = DistributedSampler(epoch_dataset)
train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
tr_loss = 0
nb_tr_examples, nb_tr_steps = 0, 0
with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar:
for step, batch in enumerate(train_dataloader):
batch = tuple(t.to(device) for t in batch)
input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
outputs = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
loss = outputs[0]
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
# I depricate FP16_Optimizer's backward func and replace as Apex document
# optimizer.backward(loss)
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
else:
loss.backward()
tr_loss += loss.item()
nb_tr_examples += input_ids.size(0)
nb_tr_steps += 1
pbar.update(1)
mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
if (step + 1) % args.gradient_accumulation_steps == 0:
optimizer.step()
scheduler.step() # Update learning rate schedule
optimizer.zero_grad()
global_step += 1
# Save a trained model
if args.local_rank == -1 or torch.distributed.get_rank() == 0:
logging.info("** ** * Saving fine-tuned model ** ** * ")
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
model_to_save.save_pretrained(args.output_dir)
tokenizer.save_pretrained(args.output_dir)
if __name__ == '__main__':
main()
from argparse import ArgumentParser
from pathlib import Path
from tqdm import tqdm, trange
from tempfile import TemporaryDirectory
import shelve
from multiprocessing import Pool
from random import random, randrange, randint, shuffle, choice
from pytorch_transformers.tokenization_bert import BertTokenizer
import numpy as np
import json
import collections
class DocumentDatabase:
def __init__(self, reduce_memory=False):
if reduce_memory:
self.temp_dir = TemporaryDirectory()
self.working_dir = Path(self.temp_dir.name)
self.document_shelf_filepath = self.working_dir / 'shelf.db'
self.document_shelf = shelve.open(str(self.document_shelf_filepath),
flag='n', protocol=-1)
self.documents = None
else:
self.documents = []
self.document_shelf = None
self.document_shelf_filepath = None
self.temp_dir = None
self.doc_lengths = []
self.doc_cumsum = None
self.cumsum_max = None
self.reduce_memory = reduce_memory
def add_document(self, document):
if not document:
return
if self.reduce_memory:
current_idx = len(self.doc_lengths)
self.document_shelf[str(current_idx)] = document
else:
self.documents.append(document)
self.doc_lengths.append(len(document))
def _precalculate_doc_weights(self):
self.doc_cumsum = np.cumsum(self.doc_lengths)
self.cumsum_max = self.doc_cumsum[-1]
def sample_doc(self, current_idx, sentence_weighted=True):
# Uses the current iteration counter to ensure we don't sample the same doc twice
if sentence_weighted:
# With sentence weighting, we sample docs proportionally to their sentence length
if self.doc_cumsum is None or len(self.doc_cumsum) != len(self.doc_lengths):
self._precalculate_doc_weights()
rand_start = self.doc_cumsum[current_idx]
rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
sentence_index = randrange(rand_start, rand_end) % self.cumsum_max
sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
else:
# If we don't use sentence weighting, then every doc has an equal chance to be chosen
sampled_doc_index = (current_idx + randrange(1, len(self.doc_lengths))) % len(self.doc_lengths)
assert sampled_doc_index != current_idx
if self.reduce_memory:
return self.document_shelf[str(sampled_doc_index)]
else:
return self.documents[sampled_doc_index]
def __len__(self):
return len(self.doc_lengths)
def __getitem__(self, item):
if self.reduce_memory:
return self.document_shelf[str(item)]
else:
return self.documents[item]
def __enter__(self):
return self
def __exit__(self, exc_type, exc_val, traceback):
if self.document_shelf is not None:
self.document_shelf.close()
if self.temp_dir is not None:
self.temp_dir.cleanup()
def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens):
"""Truncates a pair of sequences to a maximum sequence length. Lifted from Google's BERT repo."""
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_num_tokens:
break
trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
assert len(trunc_tokens) >= 1
# We want to sometimes truncate from the front and sometimes from the
# back to add more randomness and avoid biases.
if random() < 0.5:
del trunc_tokens[0]
else:
trunc_tokens.pop()
MaskedLmInstance = collections.namedtuple("MaskedLmInstance",
["index", "label"])
def create_masked_lm_predictions(tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list):
"""Creates the predictions for the masked LM objective. This is mostly copied from the Google BERT repo, but
with several refactors to clean it up and remove a lot of unnecessary variables."""
cand_indices = []
for (i, token) in enumerate(tokens):
if token == "[CLS]" or token == "[SEP]":
continue
# Whole Word Masking means that if we mask all of the wordpieces
# corresponding to an original word. When a word has been split into
# WordPieces, the first token does not have any marker and any subsequence
# tokens are prefixed with ##. So whenever we see the ## token, we
# append it to the previous set of word indexes.
#
# Note that Whole Word Masking does *not* change the training code
# at all -- we still predict each WordPiece independently, softmaxed
# over the entire vocabulary.
if (whole_word_mask and len(cand_indices) >= 1 and token.startswith("##")):
cand_indices[-1].append(i)
else:
cand_indices.append([i])
num_to_mask = min(max_predictions_per_seq,
max(1, int(round(len(tokens) * masked_lm_prob))))
shuffle(cand_indices)
masked_lms = []
covered_indexes = set()
for index_set in cand_indices:
if len(masked_lms) >= num_to_mask:
break
# If adding a whole-word mask would exceed the maximum number of
# predictions, then just skip this candidate.
if len(masked_lms) + len(index_set) > num_to_mask:
continue
is_any_index_covered = False
for index in index_set:
if index in covered_indexes:
is_any_index_covered = True
break
if is_any_index_covered:
continue
for index in index_set:
covered_indexes.add(index)
masked_token = None
# 80% of the time, replace with [MASK]
if random() < 0.8:
masked_token = "[MASK]"
else:
# 10% of the time, keep original
if random() < 0.5:
masked_token = tokens[index]
# 10% of the time, replace with random word
else:
masked_token = choice(vocab_list)
masked_lms.append(MaskedLmInstance(index=index, label=tokens[index]))
tokens[index] = masked_token
assert len(masked_lms) <= num_to_mask
masked_lms = sorted(masked_lms, key=lambda x: x.index)
mask_indices = [p.index for p in masked_lms]
masked_token_labels = [p.label for p in masked_lms]
return tokens, mask_indices, masked_token_labels
def create_instances_from_document(
doc_database, doc_idx, max_seq_length, short_seq_prob,
masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list):
"""This code is mostly a duplicate of the equivalent function from Google BERT's repo.
However, we make some changes and improvements. Sampling is improved and no longer requires a loop in this function.
Also, documents are sampled proportionally to the number of sentences they contain, which means each sentence
(rather than each document) has an equal chance of being sampled as a false example for the NextSentence task."""
document = doc_database[doc_idx]
# Account for [CLS], [SEP], [SEP]
max_num_tokens = max_seq_length - 3
# We *usually* want to fill up the entire sequence since we are padding
# to `max_seq_length` anyways, so short sequences are generally wasted
# computation. However, we *sometimes*
# (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter
# sequences to minimize the mismatch between pre-training and fine-tuning.
# The `target_seq_length` is just a rough target however, whereas
# `max_seq_length` is a hard limit.
target_seq_length = max_num_tokens
if random() < short_seq_prob:
target_seq_length = randint(2, max_num_tokens)
# We DON'T just concatenate all of the tokens from a document into a long
# sequence and choose an arbitrary split point because this would make the
# next sentence prediction task too easy. Instead, we split the input into
# segments "A" and "B" based on the actual "sentences" provided by the user
# input.
instances = []
current_chunk = []
current_length = 0
i = 0
while i < len(document):
segment = document[i]
current_chunk.append(segment)
current_length += len(segment)
if i == len(document) - 1 or current_length >= target_seq_length:
if current_chunk:
# `a_end` is how many segments from `current_chunk` go into the `A`
# (first) sentence.
a_end = 1
if len(current_chunk) >= 2:
a_end = randrange(1, len(current_chunk))
tokens_a = []
for j in range(a_end):
tokens_a.extend(current_chunk[j])
tokens_b = []
# Random next
if len(current_chunk) == 1 or random() < 0.5:
is_random_next = True
target_b_length = target_seq_length - len(tokens_a)
# Sample a random document, with longer docs being sampled more frequently
random_document = doc_database.sample_doc(current_idx=doc_idx, sentence_weighted=True)
random_start = randrange(0, len(random_document))
for j in range(random_start, len(random_document)):
tokens_b.extend(random_document[j])
if len(tokens_b) >= target_b_length:
break
# We didn't actually use these segments so we "put them back" so
# they don't go to waste.
num_unused_segments = len(current_chunk) - a_end
i -= num_unused_segments
# Actual next
else:
is_random_next = False
for j in range(a_end, len(current_chunk)):
tokens_b.extend(current_chunk[j])
truncate_seq_pair(tokens_a, tokens_b, max_num_tokens)
assert len(tokens_a) >= 1
assert len(tokens_b) >= 1
tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
# The segment IDs are 0 for the [CLS] token, the A tokens and the first [SEP]
# They are 1 for the B tokens and the final [SEP]
segment_ids = [0 for _ in range(len(tokens_a) + 2)] + [1 for _ in range(len(tokens_b) + 1)]
tokens, masked_lm_positions, masked_lm_labels = create_masked_lm_predictions(
tokens, masked_lm_prob, max_predictions_per_seq, whole_word_mask, vocab_list)
instance = {
"tokens": tokens,
"segment_ids": segment_ids,
"is_random_next": is_random_next,
"masked_lm_positions": masked_lm_positions,
"masked_lm_labels": masked_lm_labels}
instances.append(instance)
current_chunk = []
current_length = 0
i += 1
return instances
def create_training_file(docs, vocab_list, args, epoch_num):
epoch_filename = args.output_dir / "epoch_{}.json".format(epoch_num)
num_instances = 0
with epoch_filename.open('w') as epoch_file:
for doc_idx in trange(len(docs), desc="Document"):
doc_instances = create_instances_from_document(
docs, doc_idx, max_seq_length=args.max_seq_len, short_seq_prob=args.short_seq_prob,
masked_lm_prob=args.masked_lm_prob, max_predictions_per_seq=args.max_predictions_per_seq,
whole_word_mask=args.do_whole_word_mask, vocab_list=vocab_list)
doc_instances = [json.dumps(instance) for instance in doc_instances]
for instance in doc_instances:
epoch_file.write(instance + '\n')
num_instances += 1
metrics_file = args.output_dir / "epoch_{}_metrics.json".format(epoch_num)
with metrics_file.open('w') as metrics_file:
metrics = {
"num_training_examples": num_instances,
"max_seq_len": args.max_seq_len
}
metrics_file.write(json.dumps(metrics))
def main():
parser = ArgumentParser()
parser.add_argument('--train_corpus', type=Path, required=True)
parser.add_argument("--output_dir", type=Path, required=True)
parser.add_argument("--bert_model", type=str, required=True,
choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased",
"bert-base-multilingual-uncased", "bert-base-chinese", "bert-base-multilingual-cased"])
parser.add_argument("--do_lower_case", action="store_true")
parser.add_argument("--do_whole_word_mask", action="store_true",
help="Whether to use whole word masking rather than per-WordPiece masking.")
parser.add_argument("--reduce_memory", action="store_true",
help="Reduce memory usage for large datasets by keeping data on disc rather than in memory")
parser.add_argument("--num_workers", type=int, default=1,
help="The number of workers to use to write the files")
parser.add_argument("--epochs_to_generate", type=int, default=3,
help="Number of epochs of data to pregenerate")
parser.add_argument("--max_seq_len", type=int, default=128)
parser.add_argument("--short_seq_prob", type=float, default=0.1,
help="Probability of making a short sentence as a training example")
parser.add_argument("--masked_lm_prob", type=float, default=0.15,
help="Probability of masking each token for the LM task")
parser.add_argument("--max_predictions_per_seq", type=int, default=20,
help="Maximum number of tokens to mask in each sequence")
args = parser.parse_args()
if args.num_workers > 1 and args.reduce_memory:
raise ValueError("Cannot use multiple workers while reducing memory")
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
vocab_list = list(tokenizer.vocab.keys())
with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
with args.train_corpus.open() as f:
doc = []
for line in tqdm(f, desc="Loading Dataset", unit=" lines"):
line = line.strip()
if line == "":
docs.add_document(doc)
doc = []
else:
tokens = tokenizer.tokenize(line)
doc.append(tokens)
if doc:
docs.add_document(doc) # If the last doc didn't end on a newline, make sure it still gets added
if len(docs) <= 1:
exit("ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
"ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
"indicate breaks between documents in your input file. If your dataset does not contain multiple "
"documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
"sections or paragraphs.")
args.output_dir.mkdir(exist_ok=True)
if args.num_workers > 1:
writer_workers = Pool(min(args.num_workers, args.epochs_to_generate))
arguments = [(docs, vocab_list, args, idx) for idx in range(args.epochs_to_generate)]
writer_workers.starmap(create_training_file, arguments)
else:
for epoch in trange(args.epochs_to_generate, desc="Epoch"):
create_training_file(docs, vocab_list, args, epoch)
if __name__ == '__main__':
main()
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT finetuning runner."""
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse
import logging
import os
import random
from io import open
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange
from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
from pytorch_transformers.modeling_bert import BertForPreTraining
from pytorch_transformers.tokenization_bert import BertTokenizer
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt='%m/%d/%Y %H:%M:%S',
level=logging.INFO)
logger = logging.getLogger(__name__)
class BERTDataset(Dataset):
def __init__(self, corpus_path, tokenizer, seq_len, encoding="utf-8", corpus_lines=None, on_memory=True):
self.vocab = tokenizer.vocab
self.tokenizer = tokenizer
self.seq_len = seq_len
self.on_memory = on_memory
self.corpus_lines = corpus_lines # number of non-empty lines in input corpus
self.corpus_path = corpus_path
self.encoding = encoding
self.current_doc = 0 # to avoid random sentence from same doc
# for loading samples directly from file
self.sample_counter = 0 # used to keep track of full epochs on file
self.line_buffer = None # keep second sentence of a pair in memory and use as first sentence in next pair
# for loading samples in memory
self.current_random_doc = 0
self.num_docs = 0
self.sample_to_doc = [] # map sample index to doc and line
# load samples into memory
if on_memory:
self.all_docs = []
doc = []
self.corpus_lines = 0
with open(corpus_path, "r", encoding=encoding) as f:
for line in tqdm(f, desc="Loading Dataset", total=corpus_lines):
line = line.strip()
if line == "":
self.all_docs.append(doc)
doc = []
#remove last added sample because there won't be a subsequent line anymore in the doc
self.sample_to_doc.pop()
else:
#store as one sample
sample = {"doc_id": len(self.all_docs),
"line": len(doc)}
self.sample_to_doc.append(sample)
doc.append(line)
self.corpus_lines = self.corpus_lines + 1
# if last row in file is not empty
if self.all_docs[-1] != doc:
self.all_docs.append(doc)
self.sample_to_doc.pop()
self.num_docs = len(self.all_docs)
# load samples later lazily from disk
else:
if self.corpus_lines is None:
with open(corpus_path, "r", encoding=encoding) as f:
self.corpus_lines = 0
for line in tqdm(f, desc="Loading Dataset", total=corpus_lines):
if line.strip() == "":
self.num_docs += 1
else:
self.corpus_lines += 1
# if doc does not end with empty line
if line.strip() != "":
self.num_docs += 1
self.file = open(corpus_path, "r", encoding=encoding)
self.random_file = open(corpus_path, "r", encoding=encoding)
def __len__(self):
# last line of doc won't be used, because there's no "nextSentence". Additionally, we start counting at 0.
return self.corpus_lines - self.num_docs - 1
def __getitem__(self, item):
cur_id = self.sample_counter
self.sample_counter += 1
if not self.on_memory:
# after one epoch we start again from beginning of file
if cur_id != 0 and (cur_id % len(self) == 0):
self.file.close()
self.file = open(self.corpus_path, "r", encoding=self.encoding)
t1, t2, is_next_label = self.random_sent(item)
# tokenize
tokens_a = self.tokenizer.tokenize(t1)
tokens_b = self.tokenizer.tokenize(t2)
# combine to one sample
cur_example = InputExample(guid=cur_id, tokens_a=tokens_a, tokens_b=tokens_b, is_next=is_next_label)
# transform sample to features
cur_features = convert_example_to_features(cur_example, self.seq_len, self.tokenizer)
cur_tensors = (torch.tensor(cur_features.input_ids),
torch.tensor(cur_features.input_mask),
torch.tensor(cur_features.segment_ids),
torch.tensor(cur_features.lm_label_ids),
torch.tensor(cur_features.is_next))
return cur_tensors
def random_sent(self, index):
"""
Get one sample from corpus consisting of two sentences. With prob. 50% these are two subsequent sentences
from one doc. With 50% the second sentence will be a random one from another doc.
:param index: int, index of sample.
:return: (str, str, int), sentence 1, sentence 2, isNextSentence Label
"""
t1, t2 = self.get_corpus_line(index)
if random.random() > 0.5:
label = 0
else:
t2 = self.get_random_line()
label = 1
assert len(t1) > 0
assert len(t2) > 0
return t1, t2, label
def get_corpus_line(self, item):
"""
Get one sample from corpus consisting of a pair of two subsequent lines from the same doc.
:param item: int, index of sample.
:return: (str, str), two subsequent sentences from corpus
"""
t1 = ""
t2 = ""
assert item < self.corpus_lines
if self.on_memory:
sample = self.sample_to_doc[item]
t1 = self.all_docs[sample["doc_id"]][sample["line"]]
t2 = self.all_docs[sample["doc_id"]][sample["line"]+1]
# used later to avoid random nextSentence from same doc
self.current_doc = sample["doc_id"]
return t1, t2
else:
if self.line_buffer is None:
# read first non-empty line of file
while t1 == "" :
t1 = next(self.file).strip()
t2 = next(self.file).strip()
else:
# use t2 from previous iteration as new t1
t1 = self.line_buffer
t2 = next(self.file).strip()
# skip empty rows that are used for separating documents and keep track of current doc id
while t2 == "" or t1 == "":
t1 = next(self.file).strip()
t2 = next(self.file).strip()
self.current_doc = self.current_doc+1
self.line_buffer = t2
assert t1 != ""
assert t2 != ""
return t1, t2
def get_random_line(self):
"""
Get random line from another document for nextSentence task.
:return: str, content of one line
"""
# Similar to original tf repo: This outer loop should rarely go for more than one iteration for large
# corpora. However, just to be careful, we try to make sure that
# the random document is not the same as the document we're processing.
for _ in range(10):
if self.on_memory:
rand_doc_idx = random.randint(0, len(self.all_docs)-1)
rand_doc = self.all_docs[rand_doc_idx]
line = rand_doc[random.randrange(len(rand_doc))]
else:
rand_index = random.randint(1, self.corpus_lines if self.corpus_lines < 1000 else 1000)
#pick random line
for _ in range(rand_index):
line = self.get_next_line()
#check if our picked random line is really from another doc like we want it to be
if self.current_random_doc != self.current_doc:
break
return line
def get_next_line(self):
""" Gets next line of random_file and starts over when reaching end of file"""
try:
line = next(self.random_file).strip()
#keep track of which document we are currently looking at to later avoid having the same doc as t1
if line == "":
self.current_random_doc = self.current_random_doc + 1
line = next(self.random_file).strip()
except StopIteration:
self.random_file.close()
self.random_file = open(self.corpus_path, "r", encoding=self.encoding)
line = next(self.random_file).strip()
return line
class InputExample(object):
"""A single training/test example for the language model."""
def __init__(self, guid, tokens_a, tokens_b=None, is_next=None, lm_labels=None):
"""Constructs a InputExample.
Args:
guid: Unique id for the example.
tokens_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
tokens_b: (Optional) string. The untokenized text of the second sequence.
Only must be specified for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
"""
self.guid = guid
self.tokens_a = tokens_a
self.tokens_b = tokens_b
self.is_next = is_next # nextSentence
self.lm_labels = lm_labels # masked words for language model
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, input_ids, input_mask, segment_ids, is_next, lm_label_ids):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.is_next = is_next
self.lm_label_ids = lm_label_ids
def random_word(tokens, tokenizer):
"""
Masking some random tokens for Language Model task with probabilities as in the original BERT paper.
:param tokens: list of str, tokenized sentence.
:param tokenizer: Tokenizer, object used for tokenization (we need it's vocab here)
:return: (list of str, list of int), masked tokens and related labels for LM prediction
"""
output_label = []
for i, token in enumerate(tokens):
prob = random.random()
# mask token with 15% probability
if prob < 0.15:
prob /= 0.15
# 80% randomly change token to mask token
if prob < 0.8:
tokens[i] = "[MASK]"
# 10% randomly change token to random token
elif prob < 0.9:
tokens[i] = random.choice(list(tokenizer.vocab.items()))[0]
# -> rest 10% randomly keep current token
# append current token to output (we will predict these later)
try:
output_label.append(tokenizer.vocab[token])
except KeyError:
# For unknown words (should not occur with BPE vocab)
output_label.append(tokenizer.vocab["[UNK]"])
logger.warning("Cannot find token '{}' in vocab. Using [UNK] insetad".format(token))
else:
# no masking token (will be ignored by loss function later)
output_label.append(-1)
return tokens, output_label
def convert_example_to_features(example, max_seq_length, tokenizer):
"""
Convert a raw sample (pair of sentences as tokenized strings) into a proper training sample with
IDs, LM labels, input_mask, CLS and SEP tokens etc.
:param example: InputExample, containing sentence input as strings and is_next label
:param max_seq_length: int, maximum length of sequence.
:param tokenizer: Tokenizer
:return: InputFeatures, containing all inputs and labels of one sample as IDs (as used for model training)
"""
tokens_a = example.tokens_a
tokens_b = example.tokens_b
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3"
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
tokens_a, t1_label = random_word(tokens_a, tokenizer)
tokens_b, t2_label = random_word(tokens_b, tokenizer)
# concatenate lm labels and account for CLS, SEP, SEP
lm_label_ids = ([-1] + t1_label + [-1] + t2_label + [-1])
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambigiously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = []
segment_ids = []
tokens.append("[CLS]")
segment_ids.append(0)
for token in tokens_a:
tokens.append(token)
segment_ids.append(0)
tokens.append("[SEP]")
segment_ids.append(0)
assert len(tokens_b) > 0
for token in tokens_b:
tokens.append(token)
segment_ids.append(1)
tokens.append("[SEP]")
segment_ids.append(1)
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1] * len(input_ids)
# Zero-pad up to the sequence length.
while len(input_ids) < max_seq_length:
input_ids.append(0)
input_mask.append(0)
segment_ids.append(0)
lm_label_ids.append(-1)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
assert len(lm_label_ids) == max_seq_length
if example.guid < 5:
logger.info("*** Example ***")
logger.info("guid: %s" % (example.guid))
logger.info("tokens: %s" % " ".join(
[str(x) for x in tokens]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info(
"segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
logger.info("LM label: %s " % (lm_label_ids))
logger.info("Is next sentence label: %s " % (example.is_next))
features = InputFeatures(input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
lm_label_ids=lm_label_ids,
is_next=example.is_next)
return features
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--train_corpus",
default=None,
type=str,
required=True,
help="The input train corpus.")
parser.add_argument("--bert_model", default=None, type=str, required=True,
help="Bert pre-trained model selected in the list: bert-base-uncased, "
"bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
parser.add_argument("--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model checkpoints will be written.")
## Other parameters
parser.add_argument("--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after WordPiece tokenization. \n"
"Sequences longer than this will be truncated, and sequences shorter \n"
"than this will be padded.")
parser.add_argument("--do_train",
action='store_true',
help="Whether to run training.")
parser.add_argument("--train_batch_size",
default=32,
type=int,
help="Total batch size for training.")
parser.add_argument("--learning_rate",
default=3e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--adam_epsilon",
default=1e-8,
type=float,
help="Epsilon for Adam optimizer.")
parser.add_argument("--num_train_epochs",
default=3.0,
type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--warmup_steps",
default=0,
type=int,
help="Linear warmup over warmup_steps.")
parser.add_argument("--no_cuda",
action='store_true',
help="Whether not to use CUDA when available")
parser.add_argument("--on_memory",
action='store_true',
help="Whether to load train samples into memory or use disk")
parser.add_argument("--do_lower_case",
action='store_true',
help="Whether to lower case the input text. True for uncased models, False for cased models.")
parser.add_argument("--local_rank",
type=int,
default=-1,
help="local_rank for distributed training on gpus")
parser.add_argument('--seed',
type=int,
default=42,
help="random seed for initialization")
parser.add_argument('--gradient_accumulation_steps',
type=int,
default=1,
help="Number of updates steps to accumualte before performing a backward/update pass.")
parser.add_argument('--fp16',
action='store_true',
help="Whether to use 16-bit float precision instead of 32-bit")
parser.add_argument('--loss_scale',
type = float, default = 0,
help = "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
"0 (default value): dynamic loss scaling.\n"
"Positive power of 2: static loss scaling value.\n")
args = parser.parse_args()
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
n_gpu = torch.cuda.device_count()
else:
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
n_gpu = 1
# Initializes the distributed backend which will take care of sychronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
device, n_gpu, bool(args.local_rank != -1), args.fp16))
if args.gradient_accumulation_steps < 1:
raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
args.gradient_accumulation_steps))
args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
if not args.do_train:
raise ValueError("Training is currently the only implemented execution option. Please set `do_train`.")
if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
if not os.path.exists(args.output_dir) and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
os.makedirs(args.output_dir)
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
#train_examples = None
num_train_optimization_steps = None
if args.do_train:
print("Loading Train Dataset", args.train_corpus)
train_dataset = BERTDataset(args.train_corpus, tokenizer, seq_len=args.max_seq_length,
corpus_lines=None, on_memory=args.on_memory)
num_train_optimization_steps = int(
len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
if args.local_rank != -1:
num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
# Prepare model
model = BertForPreTraining.from_pretrained(args.bert_model)
if args.fp16:
model.half()
model.to(device)
if args.local_rank != -1:
try:
from apex.parallel import DistributedDataParallel as DDP
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
model = DDP(model)
elif n_gpu > 1:
model = torch.nn.DataParallel(model)
# Prepare optimizer
if args.do_train:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
{'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
if args.fp16:
try:
from apex.optimizers import FP16_Optimizer
from apex.optimizers import FusedAdam
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
optimizer = FusedAdam(optimizer_grouped_parameters,
lr=args.learning_rate,
bias_correction=False,
max_grad_norm=1.0)
if args.loss_scale == 0:
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
else:
optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
else:
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)
global_step = 0
if args.do_train:
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_dataset))
logger.info(" Batch size = %d", args.train_batch_size)
logger.info(" Num steps = %d", num_train_optimization_steps)
if args.local_rank == -1:
train_sampler = RandomSampler(train_dataset)
else:
#TODO: check if this works with current data generator from disk that relies on next(file)
# (it doesn't return item back by index)
train_sampler = DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
model.train()
for _ in trange(int(args.num_train_epochs), desc="Epoch"):
tr_loss = 0
nb_tr_examples, nb_tr_steps = 0, 0
for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
batch = tuple(t.to(device) for t in batch)
input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch
outputs = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next)
loss = outputs[0]
if n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu.
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
optimizer.backward(loss)
else:
loss.backward()
tr_loss += loss.item()
nb_tr_examples += input_ids.size(0)
nb_tr_steps += 1
if (step + 1) % args.gradient_accumulation_steps == 0:
optimizer.step()
scheduler.step() # Update learning rate schedule
optimizer.zero_grad()
global_step += 1
# Save a trained model
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
logger.info("** ** * Saving fine - tuned model ** ** * ")
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
model_to_save.save_pretrained(args.output_dir)
tokenizer.save_pretrained(args.output_dir)
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def accuracy(out, labels):
outputs = np.argmax(out, axis=1)
return np.sum(outputs == labels)
if __name__ == "__main__":
main()
......@@ -39,12 +39,17 @@ from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
XLMConfig, XLMForSequenceClassification,
XLMTokenizer, XLNetConfig,
XLNetForSequenceClassification,
XLNetTokenizer)
XLNetTokenizer,
DistilBertConfig,
DistilBertForSequenceClassification,
DistilBertTokenizer)
from pytorch_transformers import AdamW, WarmupLinearSchedule
from utils_glue import (compute_metrics, convert_examples_to_features,
output_modes, processors)
from pytorch_transformers import glue_compute_metrics as compute_metrics
from pytorch_transformers import glue_output_modes as output_modes
from pytorch_transformers import glue_processors as processors
from pytorch_transformers import glue_convert_examples_to_features as convert_examples_to_features
logger = logging.getLogger(__name__)
......@@ -55,6 +60,7 @@ MODEL_CLASSES = {
'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer)
}
......@@ -128,7 +134,7 @@ def train(args, train_dataset, model, tokenizer):
batch = tuple(t.to(args.device) for t in batch)
inputs = {'input_ids': batch[0],
'attention_mask': batch[1],
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM and RoBERTa don't use segment_ids
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM, DistilBERT and RoBERTa don't use segment_ids
'labels': batch[3]}
outputs = model(**inputs)
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
......@@ -218,7 +224,7 @@ def evaluate(args, model, tokenizer, prefix=""):
with torch.no_grad():
inputs = {'input_ids': batch[0],
'attention_mask': batch[1],
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM and RoBERTa don't use segment_ids
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM, DistilBERT and RoBERTa don't use segment_ids
'labels': batch[3]}
outputs = model(**inputs)
tmp_eval_loss, logits = outputs[:2]
......@@ -273,11 +279,6 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
label_list[1], label_list[2] = label_list[2], label_list[1]
examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
cls_token_at_end=bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end
cls_token=tokenizer.cls_token,
cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
sep_token=tokenizer.sep_token,
sep_token_extra=bool(args.model_type in ['roberta']), # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet
pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
......
......@@ -14,7 +14,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Fine-tuning the library models for language modeling on WikiText-2 (GPT, GPT-2, BERT, RoBERTa).
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
using a masked language modeling (MLM) loss.
"""
......@@ -39,7 +39,8 @@ from pytorch_transformers import (WEIGHTS_NAME, AdamW, WarmupLinearSchedule,
BertConfig, BertForMaskedLM, BertTokenizer,
GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
RobertaConfig, RobertaForMaskedLM, RobertaTokenizer,
DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
logger = logging.getLogger(__name__)
......@@ -49,7 +50,8 @@ MODEL_CLASSES = {
'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer)
'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
}
......@@ -73,7 +75,7 @@ class TextDataset(Dataset):
tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
while len(tokenized_text) >= block_size: # Truncate in block of block_size
self.examples.append(tokenizer.add_special_tokens_single_sentence(tokenized_text[:block_size]))
self.examples.append(tokenizer.add_special_tokens_single_sequence(tokenized_text[:block_size]))
tokenized_text = tokenized_text[block_size:]
# Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
# If your dataset is small, first you should loook for a bigger one :-) and second you
......@@ -247,7 +249,6 @@ def evaluate(args, model, tokenizer, prefix=""):
# Loop to handle MNLI double evaluation (matched, mis-matched)
eval_output_dir = args.output_dir
results = {}
eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
......@@ -289,7 +290,7 @@ def evaluate(args, model, tokenizer, prefix=""):
logger.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
return results
return result
def main():
......@@ -381,7 +382,7 @@ def main():
parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
args = parser.parse_args()
if args.model_type in ["bert", "roberta"] and not args.mlm:
if args.model_type in ["bert", "roberta", "distilbert"] and not args.mlm:
raise ValueError("BERT and RoBERTa do not have LM heads but masked LM heads. They must be run using the --mlm "
"flag (masked language modeling).")
if args.eval_data_file is None and args.do_eval:
......
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Finetuning the library models for multiple choice (Bert, Roberta, XLNet)."""
from __future__ import absolute_import, division, print_function
import argparse
import glob
import logging
import os
import random
import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tensorboardX import SummaryWriter
from tqdm import tqdm, trange
from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
BertForMultipleChoice, BertTokenizer,
XLNetConfig, XLNetForMultipleChoice,
XLNetTokenizer, RobertaConfig,
RobertaForMultipleChoice, RobertaTokenizer)
from pytorch_transformers import AdamW, WarmupLinearSchedule
from utils_multiple_choice import (convert_examples_to_features, processors)
logger = logging.getLogger(__name__)
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, RobertaConfig)), ())
MODEL_CLASSES = {
'bert': (BertConfig, BertForMultipleChoice, BertTokenizer),
'xlnet': (XLNetConfig, XLNetForMultipleChoice, XLNetTokenizer),
'roberta': (RobertaConfig, RobertaForMultipleChoice, RobertaTokenizer)
}
def select_field(features, field):
return [
[
choice[field]
for choice in feature.choices_features
]
for feature in features
]
def simple_accuracy(preds, labels):
return (preds == labels).mean()
def set_seed(args):
random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.n_gpu > 0:
torch.cuda.manual_seed_all(args.seed)
def train(args, train_dataset, model, tokenizer):
""" Train the model """
if args.local_rank in [-1, 0]:
tb_writer = SummaryWriter()
args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
if args.max_steps > 0:
t_total = args.max_steps
args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
else:
t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
# Prepare optimizer and schedule (linear warmup and decay)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
{'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
{'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
if args.fp16:
try:
from apex import amp
except ImportError:
raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
# multi-gpu training (should be after apex fp16 initialization)
if args.n_gpu > 1:
model = torch.nn.DataParallel(model)
# Distributed training (should be after apex fp16 initialization)
if args.local_rank != -1:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
output_device=args.local_rank,
find_unused_parameters=True)
# Train!
logger.info("***** Running training *****")
logger.info(" Num examples = %d", len(train_dataset))
logger.info(" Num Epochs = %d", args.num_train_epochs)
logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
logger.info(" Total train batch size (w. parallel, distributed & accumulation) = %d",
args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
logger.info(" Total optimization steps = %d", t_total)
global_step = 0
tr_loss, logging_loss = 0.0, 0.0
best_dev_acc, best_dev_loss = 0.0, 99999999999.0
best_steps = 0
model.zero_grad()
train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
set_seed(args) # Added here for reproductibility (even between python 2 and 3)
for _ in train_iterator:
epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
for step, batch in enumerate(epoch_iterator):
model.train()
batch = tuple(t.to(args.device) for t in batch)
inputs = {'input_ids': batch[0],
'attention_mask': batch[1],
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids
'labels': batch[3]}
outputs = model(**inputs)
loss = outputs[0] # model outputs are always tuple in pytorch-transformers (see doc)
if args.n_gpu > 1:
loss = loss.mean() # mean() to average on multi-gpu parallel training
if args.gradient_accumulation_steps > 1:
loss = loss / args.gradient_accumulation_steps
if args.fp16:
with amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
else:
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
tr_loss += loss.item()
if (step + 1) % args.gradient_accumulation_steps == 0:
optimizer.step()
scheduler.step() # Update learning rate schedule
model.zero_grad()
global_step += 1
if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
# Log metrics
if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well
results = evaluate(args, model, tokenizer)
for key, value in results.items():
tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
if results["eval_acc"] > best_dev_acc:
best_dev_acc = results["eval_acc"]
best_dev_loss = results["eval_loss"]
best_steps = global_step
if args.do_test:
results_test = evaluate(args, model, tokenizer, test=True)
for key, value in results_test.items():
tb_writer.add_scalar('test_{}'.format(key), value, global_step)
logger.info("test acc: %s, loss: %s, global steps: %s", str(results_test['eval_acc']), str(results_test['eval_loss']), str(global_step))
tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
logger.info("Average loss: %s at global step: %s", str((tr_loss - logging_loss)/args.logging_steps), str(global_step))
logging_loss = tr_loss
if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
# Save model checkpoint
output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
if not os.path.exists(output_dir):
os.makedirs(output_dir)
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
model_to_save.save_pretrained(output_dir)
tokenizer.save_vocabulary(output_dir)
torch.save(args, os.path.join(output_dir, 'training_args.bin'))
logger.info("Saving model checkpoint to %s", output_dir)
if args.max_steps > 0 and global_step > args.max_steps:
epoch_iterator.close()
break
if args.max_steps > 0 and global_step > args.max_steps:
train_iterator.close()
break
if args.local_rank in [-1, 0]:
tb_writer.close()
return global_step, tr_loss / global_step, best_steps
def evaluate(args, model, tokenizer, prefix="", test=False):
eval_task_names = (args.task_name,)
eval_outputs_dirs = (args.output_dir,)
results = {}
for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=not test, test=test)
if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
os.makedirs(eval_output_dir)
args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
# Note that DistributedSampler samples randomly
eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
# Eval!
logger.info("***** Running evaluation {} *****".format(prefix))
logger.info(" Num examples = %d", len(eval_dataset))
logger.info(" Batch size = %d", args.eval_batch_size)
eval_loss = 0.0
nb_eval_steps = 0
preds = None
out_label_ids = None
for batch in tqdm(eval_dataloader, desc="Evaluating"):
model.eval()
batch = tuple(t.to(args.device) for t in batch)
with torch.no_grad():
inputs = {'input_ids': batch[0],
'attention_mask': batch[1],
'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None, # XLM don't use segment_ids
'labels': batch[3]}
outputs = model(**inputs)
tmp_eval_loss, logits = outputs[:2]
eval_loss += tmp_eval_loss.mean().item()
nb_eval_steps += 1
if preds is None:
preds = logits.detach().cpu().numpy()
out_label_ids = inputs['labels'].detach().cpu().numpy()
else:
preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), axis=0)
eval_loss = eval_loss / nb_eval_steps
preds = np.argmax(preds, axis=1)
acc = simple_accuracy(preds, out_label_ids)
result = {"eval_acc": acc, "eval_loss": eval_loss}
results.update(result)
output_eval_file = os.path.join(eval_output_dir, "is_test_" + str(test).lower() + "_eval_results.txt")
with open(output_eval_file, "w") as writer:
logger.info("***** Eval results {} *****".format(str(prefix) + " is test:" + str(test)))
writer.write("model =%s\n" % str(args.model_name_or_path))
writer.write("total batch size=%d\n" % (args.per_gpu_train_batch_size * args.gradient_accumulation_steps *
(torch.distributed.get_world_size() if args.local_rank != -1 else 1)))
writer.write("train num epochs=%d\n" % args.num_train_epochs)
writer.write("fp16 =%s\n" % args.fp16)
writer.write("max seq length =%d\n" % args.max_seq_length)
for key in sorted(result.keys()):
logger.info(" %s = %s", key, str(result[key]))
writer.write("%s = %s\n" % (key, str(result[key])))
return results
def load_and_cache_examples(args, task, tokenizer, evaluate=False, test=False):
if args.local_rank not in [-1, 0]:
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
processor = processors[task]()
# Load data features from cache or dataset file
if evaluate:
cached_mode = 'dev'
elif test:
cached_mode = 'test'
else:
cached_mode = 'train'
assert (evaluate == True and test == True) == False
cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
cached_mode,
list(filter(None, args.model_name_or_path.split('/'))).pop(),
str(args.max_seq_length),
str(task)))
if os.path.exists(cached_features_file):
logger.info("Loading features from cached file %s", cached_features_file)
features = torch.load(cached_features_file)
else:
logger.info("Creating features from dataset file at %s", args.data_dir)
label_list = processor.get_labels()
if evaluate:
examples = processor.get_dev_examples(args.data_dir)
elif test:
examples = processor.get_test_examples(args.data_dir)
else:
examples = processor.get_train_examples(args.data_dir)
logger.info("Training number: %s", str(len(examples)))
features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer,
cls_token_at_end=bool(args.model_type in ['xlnet']), # xlnet has a cls token at the end
cls_token=tokenizer.cls_token,
sep_token=tokenizer.sep_token,
sep_token_extra=bool(args.model_type in ['roberta']),
cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
pad_on_left=bool(args.model_type in ['xlnet']), # pad on the left for xlnet
pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
if args.local_rank in [-1, 0]:
logger.info("Saving features into cached file %s", cached_features_file)
torch.save(features, cached_features_file)
if args.local_rank == 0:
torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache
# Convert to Tensors and build dataset
all_input_ids = torch.tensor(select_field(features, 'input_ids'), dtype=torch.long)
all_input_mask = torch.tensor(select_field(features, 'input_mask'), dtype=torch.long)
all_segment_ids = torch.tensor(select_field(features, 'segment_ids'), dtype=torch.long)
all_label_ids = torch.tensor([f.label for f in features], dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
return dataset
def main():
parser = argparse.ArgumentParser()
## Required parameters
parser.add_argument("--data_dir", default=None, type=str, required=True,
help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
parser.add_argument("--model_type", default=None, type=str, required=True,
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
parser.add_argument("--task_name", default=None, type=str, required=True,
help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
parser.add_argument("--output_dir", default=None, type=str, required=True,
help="The output directory where the model predictions and checkpoints will be written.")
## Other parameters
parser.add_argument("--config_name", default="", type=str,
help="Pretrained config name or path if not the same as model_name")
parser.add_argument("--tokenizer_name", default="", type=str,
help="Pretrained tokenizer name or path if not the same as model_name")
parser.add_argument("--cache_dir", default="", type=str,
help="Where do you want to store the pre-trained models downloaded from s3")
parser.add_argument("--max_seq_length", default=128, type=int,
help="The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded.")
parser.add_argument("--do_train", action='store_true',
help="Whether to run training.")
parser.add_argument("--do_eval", action='store_true',
help="Whether to run eval on the dev set.")
parser.add_argument("--do_test", action='store_true', help='Whether to run test on the test set')
parser.add_argument("--evaluate_during_training", action='store_true',
help="Rul evaluation during training at each logging step.")
parser.add_argument("--do_lower_case", action='store_true',
help="Set this flag if you are using an uncased model.")
parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
help="Batch size per GPU/CPU for training.")
parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
help="Batch size per GPU/CPU for evaluation.")
parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
help="Number of updates steps to accumulate before performing a backward/update pass.")
parser.add_argument("--learning_rate", default=5e-5, type=float,
help="The initial learning rate for Adam.")
parser.add_argument("--weight_decay", default=0.0, type=float,
help="Weight deay if we apply some.")
parser.add_argument("--adam_epsilon", default=1e-8, type=float,
help="Epsilon for Adam optimizer.")
parser.add_argument("--max_grad_norm", default=1.0, type=float,
help="Max gradient norm.")
parser.add_argument("--num_train_epochs", default=3.0, type=float,
help="Total number of training epochs to perform.")
parser.add_argument("--max_steps", default=-1, type=int,
help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
parser.add_argument("--warmup_steps", default=0, type=int,
help="Linear warmup over warmup_steps.")
parser.add_argument('--logging_steps', type=int, default=50,
help="Log every X updates steps.")
parser.add_argument('--save_steps', type=int, default=50,
help="Save checkpoint every X updates steps.")
parser.add_argument("--eval_all_checkpoints", action='store_true',
help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
parser.add_argument("--no_cuda", action='store_true',
help="Avoid using CUDA when available")
parser.add_argument('--overwrite_output_dir', action='store_true',
help="Overwrite the content of the output directory")
parser.add_argument('--overwrite_cache', action='store_true',
help="Overwrite the cached training and evaluation sets")
parser.add_argument('--seed', type=int, default=42,
help="random seed for initialization")
parser.add_argument('--fp16', action='store_true',
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
parser.add_argument('--fp16_opt_level', type=str, default='O1',
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
"See details at https://nvidia.github.io/apex/amp.html")
parser.add_argument("--local_rank", type=int, default=-1,
help="For distributed training: local_rank")
parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
args = parser.parse_args()
if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
# Setup distant debugging if needed
if args.server_ip and args.server_port:
# Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
import ptvsd
print("Waiting for debugger attach")
ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
ptvsd.wait_for_attach()
# Setup CUDA, GPU & distributed training
if args.local_rank == -1 or args.no_cuda:
device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
args.n_gpu = torch.cuda.device_count()
else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
torch.cuda.set_device(args.local_rank)
device = torch.device("cuda", args.local_rank)
torch.distributed.init_process_group(backend='nccl')
args.n_gpu = 1
args.device = device
# Setup logging
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
datefmt = '%m/%d/%Y %H:%M:%S',
level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
# Set seed
set_seed(args)
# Prepare GLUE task
args.task_name = args.task_name.lower()
if args.task_name not in processors:
raise ValueError("Task not found: %s" % (args.task_name))
processor = processors[args.task_name]()
label_list = processor.get_labels()
num_labels = len(label_list)
# Load pretrained model and tokenizer
if args.local_rank not in [-1, 0]:
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
args.model_type = args.model_type.lower()
config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name)
tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
if args.local_rank == 0:
torch.distributed.barrier() # Make sure only the first process in distributed training will download model & vocab
model.to(args.device)
logger.info("Training/evaluation parameters %s", args)
best_steps = 0
# Training
if args.do_train:
train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
global_step, tr_loss, best_steps = train(args, train_dataset, model, tokenizer)
logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
# Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
# Create output directory if needed
if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
os.makedirs(args.output_dir)
logger.info("Saving model checkpoint to %s", args.output_dir)
# Save a trained model, configuration and tokenizer using `save_pretrained()`.
# They can then be reloaded using `from_pretrained()`
model_to_save = model.module if hasattr(model, 'module') else model # Take care of distributed/parallel training
model_to_save.save_pretrained(args.output_dir)
tokenizer.save_pretrained(args.output_dir)
# Good practice: save your training arguments together with the trained model
torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
# Load a trained model and vocabulary that you have fine-tuned
model = model_class.from_pretrained(args.output_dir)
tokenizer = tokenizer_class.from_pretrained(args.output_dir)
model.to(args.device)
# Evaluation
results = {}
if args.do_eval and args.local_rank in [-1, 0]:
if not args.do_train:
args.output_dir = args.model_name_or_path
checkpoints = [args.output_dir]
if args.eval_all_checkpoints:
checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
logger.info("Evaluate the following checkpoints: %s", checkpoints)
for checkpoint in checkpoints:
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
model = model_class.from_pretrained(checkpoint)
model.to(args.device)
result = evaluate(args, model, tokenizer, prefix=global_step)
result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
results.update(result)
if args.do_test and args.local_rank in [-1, 0]:
if not args.do_train:
args.output_dir = args.model_name_or_path
checkpoints = [args.output_dir]
# if args.eval_all_checkpoints: # can not use this to do test!!
# checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
# logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN) # Reduce logging
logger.info("Evaluate the following checkpoints: %s", checkpoints)
for checkpoint in checkpoints:
global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
model = model_class.from_pretrained(checkpoint)
model.to(args.device)
result = evaluate(args, model, tokenizer, prefix=global_step, test=True)
result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
results.update(result)
if best_steps:
logger.info("best steps of eval acc is the following checkpoints: %s", best_steps)
return results
if __name__ == "__main__":
main()
......@@ -37,7 +37,8 @@ from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
XLMConfig, XLMForQuestionAnswering,
XLMTokenizer, XLNetConfig,
XLNetForQuestionAnswering,
XLNetTokenizer)
XLNetTokenizer,
DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
from pytorch_transformers import AdamW, WarmupLinearSchedule
......@@ -59,6 +60,7 @@ MODEL_CLASSES = {
'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
}
def set_seed(args):
......
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BERT multiple choice fine-tuning: utilities to work with multiple choice tasks of reading comprehension """
from __future__ import absolute_import, division, print_function
import logging
import os
import sys
from io import open
import json
import csv
import glob
import tqdm
logger = logging.getLogger(__name__)
class InputExample(object):
"""A single training/test example for multiple choice"""
def __init__(self, example_id, question, contexts, endings, label=None):
"""Constructs a InputExample.
Args:
example_id: Unique id for the example.
contexts: list of str. The untokenized text of the first sequence (context of corresponding question).
question: string. The untokenized text of the second sequence (qustion).
endings: list of str. multiple choice's options. Its length must be equal to contexts' length.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
"""
self.example_id = example_id
self.question = question
self.contexts = contexts
self.endings = endings
self.label = label
class InputFeatures(object):
def __init__(self,
example_id,
choices_features,
label
):
self.example_id = example_id
self.choices_features = [
{
'input_ids': input_ids,
'input_mask': input_mask,
'segment_ids': segment_ids
}
for _, input_ids, input_mask, segment_ids in choices_features
]
self.label = label
class DataProcessor(object):
"""Base class for data converters for multiple choice data sets."""
def get_train_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the train set."""
raise NotImplementedError()
def get_dev_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the dev set."""
raise NotImplementedError()
def get_test_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the test set."""
raise NotImplementedError()
def get_labels(self):
"""Gets the list of labels for this data set."""
raise NotImplementedError()
class RaceProcessor(DataProcessor):
"""Processor for the RACE data set."""
def get_train_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} train".format(data_dir))
high = os.path.join(data_dir, 'train/high')
middle = os.path.join(data_dir, 'train/middle')
high = self._read_txt(high)
middle = self._read_txt(middle)
return self._create_examples(high + middle, 'train')
def get_dev_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} dev".format(data_dir))
high = os.path.join(data_dir, 'dev/high')
middle = os.path.join(data_dir, 'dev/middle')
high = self._read_txt(high)
middle = self._read_txt(middle)
return self._create_examples(high + middle, 'dev')
def get_test_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} test".format(data_dir))
high = os.path.join(data_dir, 'test/high')
middle = os.path.join(data_dir, 'test/middle')
high = self._read_txt(high)
middle = self._read_txt(middle)
return self._create_examples(high + middle, 'test')
def get_labels(self):
"""See base class."""
return ["0", "1", "2", "3"]
def _read_txt(self, input_dir):
lines = []
files = glob.glob(input_dir + "/*txt")
for file in tqdm.tqdm(files, desc="read files"):
with open(file, 'r', encoding='utf-8') as fin:
data_raw = json.load(fin)
data_raw["race_id"] = file
lines.append(data_raw)
return lines
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (_, data_raw) in enumerate(lines):
race_id = "%s-%s" % (set_type, data_raw["race_id"])
article = data_raw["article"]
for i in range(len(data_raw["answers"])):
truth = str(ord(data_raw['answers'][i]) - ord('A'))
question = data_raw['questions'][i]
options = data_raw['options'][i]
examples.append(
InputExample(
example_id=race_id,
question=question,
contexts=[article, article, article, article], # this is not efficient but convenient
endings=[options[0], options[1], options[2], options[3]],
label=truth))
return examples
class SwagProcessor(DataProcessor):
"""Processor for the SWAG data set."""
def get_train_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} train".format(data_dir))
return self._create_examples(self._read_csv(os.path.join(data_dir, "train.csv")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} dev".format(data_dir))
return self._create_examples(self._read_csv(os.path.join(data_dir, "val.csv")), "dev")
def get_test_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} dev".format(data_dir))
raise ValueError(
"For swag testing, the input file does not contain a label column. It can not be tested in current code"
"setting!"
)
return self._create_examples(self._read_csv(os.path.join(data_dir, "test.csv")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1", "2", "3"]
def _read_csv(self, input_file):
with open(input_file, 'r', encoding='utf-8') as f:
reader = csv.reader(f)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, 'utf-8') for cell in line)
lines.append(line)
return lines
def _create_examples(self, lines, type):
"""Creates examples for the training and dev sets."""
if type == "train" and lines[0][-1] != 'label':
raise ValueError(
"For training, the input file must contain a label column."
)
examples = [
InputExample(
example_id=line[2],
question=line[5], # in the swag dataset, the
# common beginning of each
# choice is stored in "sent2".
contexts = [line[4], line[4], line[4], line[4]],
endings = [line[7], line[8], line[9], line[10]],
label=line[11]
) for line in lines[1:] # we skip the line with the column names
]
return examples
class ArcProcessor(DataProcessor):
"""Processor for the ARC data set (request from allennlp)."""
def get_train_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} train".format(data_dir))
return self._create_examples(self._read_json(os.path.join(data_dir, "train.jsonl")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
logger.info("LOOKING AT {} dev".format(data_dir))
return self._create_examples(self._read_json(os.path.join(data_dir, "dev.jsonl")), "dev")
def get_test_examples(self, data_dir):
logger.info("LOOKING AT {} test".format(data_dir))
return self._create_examples(self._read_json(os.path.join(data_dir, "test.jsonl")), "test")
def get_labels(self):
"""See base class."""
return ["0", "1", "2", "3"]
def _read_json(self, input_file):
with open(input_file, 'r', encoding='utf-8') as fin:
lines = fin.readlines()
return lines
def _create_examples(self, lines, type):
"""Creates examples for the training and dev sets."""
#There are two types of labels. They should be normalized
def normalize(truth):
if truth in "ABCD":
return ord(truth) - ord("A")
elif truth in "1234":
return int(truth) - 1
else:
logger.info("truth ERROR! %s", str(truth))
return None
examples = []
three_choice = 0
four_choice = 0
five_choice = 0
other_choices = 0
# we deleted example which has more than or less than four choices
for line in tqdm.tqdm(lines, desc="read arc data"):
data_raw = json.loads(line.strip("\n"))
if len(data_raw["question"]["choices"]) == 3:
three_choice += 1
continue
elif len(data_raw["question"]["choices"]) == 5:
five_choice += 1
continue
elif len(data_raw["question"]["choices"]) != 4:
other_choices += 1
continue
four_choice += 1
truth = str(normalize(data_raw["answerKey"]))
assert truth != "None"
question_choices = data_raw["question"]
question = question_choices["stem"]
id = data_raw["id"]
options = question_choices["choices"]
if len(options) == 4:
examples.append(
InputExample(
example_id = id,
question=question,
contexts=[options[0]["para"].replace("_", ""), options[1]["para"].replace("_", ""),
options[2]["para"].replace("_", ""), options[3]["para"].replace("_", "")],
endings=[options[0]["text"], options[1]["text"], options[2]["text"], options[3]["text"]],
label=truth))
if type == "train":
assert len(examples) > 1
assert examples[0].label is not None
logger.info("len examples: %s}", str(len(examples)))
logger.info("Three choices: %s", str(three_choice))
logger.info("Five choices: %s", str(five_choice))
logger.info("Other choices: %s", str(other_choices))
logger.info("four choices: %s", str(four_choice))
return examples
def convert_examples_to_features(examples, label_list, max_seq_length,
tokenizer,
cls_token_at_end=False,
cls_token='[CLS]',
cls_token_segment_id=1,
sep_token='[SEP]',
sequence_a_segment_id=0,
sequence_b_segment_id=1,
sep_token_extra=False,
pad_token_segment_id=0,
pad_on_left=False,
pad_token=0,
mask_padding_with_zero=True):
""" Loads a data file into a list of `InputBatch`s
`cls_token_at_end` define the location of the CLS token:
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
`cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
"""
label_map = {label : i for i, label in enumerate(label_list)}
features = []
for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
if ex_index % 10000 == 0:
logger.info("Writing example %d of %d" % (ex_index, len(examples)))
choices_features = []
for ending_idx, (context, ending) in enumerate(zip(example.contexts, example.endings)):
tokens_a = tokenizer.tokenize(context)
tokens_b = None
if example.question.find("_") != -1:
#this is for cloze question
tokens_b = tokenizer.tokenize(example.question.replace("_", ending))
else:
tokens_b = tokenizer.tokenize(example.question + " " + ending)
# you can add seq token between quesiotn and ending. This does not make too much difference.
# tokens_b = tokenizer.tokenize(example.question)
# tokens_b += [sep_token]
# if sep_token_extra:
# tokens_b += [sep_token]
# tokens_b += tokenizer.tokenize(ending)
special_tokens_count = 4 if sep_token_extra else 3
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = tokens_a + [sep_token]
if sep_token_extra:
# roberta uses an extra separator b/w pairs of sentences
tokens += [sep_token]
segment_ids = [sequence_a_segment_id] * len(tokens)
if tokens_b:
tokens += tokens_b + [sep_token]
segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
if cls_token_at_end:
tokens = tokens + [cls_token]
segment_ids = segment_ids + [cls_token_segment_id]
else:
tokens = [cls_token] + tokens
segment_ids = [cls_token_segment_id] + segment_ids
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
# Zero-pad up to the sequence length.
padding_length = max_seq_length - len(input_ids)
if pad_on_left:
input_ids = ([pad_token] * padding_length) + input_ids
input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
else:
input_ids = input_ids + ([pad_token] * padding_length)
input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
choices_features.append((tokens, input_ids, input_mask, segment_ids))
label = label_map[example.label]
if ex_index < 2:
logger.info("*** Example ***")
logger.info("race_id: {}".format(example.example_id))
for choice_idx, (tokens, input_ids, input_mask, segment_ids) in enumerate(choices_features):
logger.info("choice: {}".format(choice_idx))
logger.info("tokens: {}".format(' '.join(tokens)))
logger.info("input_ids: {}".format(' '.join(map(str, input_ids))))
logger.info("input_mask: {}".format(' '.join(map(str, input_mask))))
logger.info("segment_ids: {}".format(' '.join(map(str, segment_ids))))
logger.info("label: {}".format(label))
features.append(
InputFeatures(
example_id = example.example_id,
choices_features = choices_features,
label = label
)
)
return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
# However, since we'd better not to remove tokens of options and questions, you can choose to use a bigger
# length or only pop from context
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
logger.info('Attention! you are removing from token_b (swag task is ok). '
'If you are training ARC and RACE (you are poping question + options), '
'you need to try to use a bigger max seq length!')
tokens_b.pop()
processors = {
"race": RaceProcessor,
"swag": SwagProcessor,
"arc": ArcProcessor
}
GLUE_TASKS_NUM_LABELS = {
"race", 4,
"swag", 4,
"arc", 4
}
from pytorch_transformers import (
AutoTokenizer, AutoConfig, AutoModel, AutoModelWithLMHead, AutoModelForSequenceClassification, AutoModelForQuestionAnswering
)
from pytorch_transformers.modeling_utils import add_start_docstrings
from pytorch_transformers.file_utils import add_start_docstrings
dependencies = ['torch', 'tqdm', 'boto3', 'requests', 'regex', 'sentencepiece', 'sacremoses']
......
......@@ -164,4 +164,12 @@ if _tf_available and _torch_available:
from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
cached_path, add_start_docstrings, add_end_docstrings,
WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME,
is_tf_available, is_torch_available)
\ No newline at end of file
is_tf_available, is_torch_available)
from .data import (is_sklearn_available,
InputExample, InputFeatures, DataProcessor,
glue_output_modes, glue_convert_examples_to_features,
glue_processors, glue_tasks_num_labels)
if is_sklearn_available():
from .data import glue_compute_metrics
from .processors import InputExample, InputFeatures, DataProcessor
from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
from .metrics import is_sklearn_available
if is_sklearn_available():
from .metrics import glue_compute_metrics
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
import sys
import logging
logger = logging.getLogger(__name__)
try:
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score
_has_sklearn = True
except (AttributeError, ImportError) as e:
logger.warning("To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html")
_has_sklearn = False
def is_sklearn_available():
return _has_sklearn
if _has_sklearn:
def simple_accuracy(preds, labels):
return (preds == labels).mean()
def acc_and_f1(preds, labels):
acc = simple_accuracy(preds, labels)
f1 = f1_score(y_true=labels, y_pred=preds)
return {
"acc": acc,
"f1": f1,
"acc_and_f1": (acc + f1) / 2,
}
def pearson_and_spearman(preds, labels):
pearson_corr = pearsonr(preds, labels)[0]
spearman_corr = spearmanr(preds, labels)[0]
return {
"pearson": pearson_corr,
"spearmanr": spearman_corr,
"corr": (pearson_corr + spearman_corr) / 2,
}
def glue_compute_metrics(task_name, preds, labels):
assert len(preds) == len(labels)
if task_name == "cola":
return {"mcc": matthews_corrcoef(labels, preds)}
elif task_name == "sst-2":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "mrpc":
return acc_and_f1(preds, labels)
elif task_name == "sts-b":
return pearson_and_spearman(preds, labels)
elif task_name == "qqp":
return acc_and_f1(preds, labels)
elif task_name == "mnli":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "mnli-mm":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "qnli":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "rte":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "wnli":
return {"acc": simple_accuracy(preds, labels)}
else:
raise KeyError(task_name)
from .utils import InputExample, InputFeatures, DataProcessor
from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features
......@@ -13,79 +13,81 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" BERT classification fine-tuning: utilities to work with GLUE tasks """
""" GLUE processors and helpers """
from __future__ import absolute_import, division, print_function
import csv
import logging
import os
import sys
from io import open
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import matthews_corrcoef, f1_score
from .utils import DataProcessor, InputExample, InputFeatures
logger = logging.getLogger(__name__)
def glue_convert_examples_to_features(examples, label_list, max_seq_length,
tokenizer, output_mode,
pad_on_left=False,
pad_token=0,
pad_token_segment_id=0,
mask_padding_with_zero=True):
"""
Loads a data file into a list of `InputBatch`s
"""
class InputExample(object):
"""A single training/test example for simple sequence classification."""
def __init__(self, guid, text_a, text_b=None, label=None):
"""Constructs a InputExample.
Args:
guid: Unique id for the example.
text_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
text_b: (Optional) string. The untokenized text of the second sequence.
Only must be specified for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
"""
self.guid = guid
self.text_a = text_a
self.text_b = text_b
self.label = label
label_map = {label: i for i, label in enumerate(label_list)}
features = []
for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
logger.info("Writing example %d of %d" % (ex_index, len(examples)))
class InputFeatures(object):
"""A single set of features of data."""
inputs = tokenizer.encode_plus(
example.text_a,
example.text_b,
add_special_tokens=True,
max_length=max_seq_length,
truncate_first_sequence=True # We're truncating the first sequence as a priority
)
input_ids, segment_ids = inputs["input_ids"], inputs["token_type_ids"]
def __init__(self, input_ids, input_mask, segment_ids, label_id):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.label_id = label_id
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
# Zero-pad up to the sequence length.
padding_length = max_seq_length - len(input_ids)
if pad_on_left:
input_ids = ([pad_token] * padding_length) + input_ids
input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
else:
input_ids = input_ids + ([pad_token] * padding_length)
input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
class DataProcessor(object):
"""Base class for data converters for sequence classification data sets."""
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
def get_train_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the train set."""
raise NotImplementedError()
if output_mode == "classification":
label_id = label_map[example.label]
elif output_mode == "regression":
label_id = float(example.label)
else:
raise KeyError(output_mode)
def get_dev_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the dev set."""
raise NotImplementedError()
if ex_index < 5:
logger.info("*** Example ***")
logger.info("guid: %s" % (example.guid))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
logger.info("label: %s (id = %d)" % (example.label, label_id))
def get_labels(self):
"""Gets the list of labels for this data set."""
raise NotImplementedError()
@classmethod
def _read_tsv(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, "r", encoding="utf-8-sig") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, 'utf-8') for cell in line)
lines.append(line)
return lines
features.append(
InputFeatures(input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id))
return features
class MrpcProcessor(DataProcessor):
......@@ -302,7 +304,7 @@ class QnliProcessor(DataProcessor):
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(
self._read_tsv(os.path.join(data_dir, "dev.tsv")),
self._read_tsv(os.path.join(data_dir, "dev.tsv")),
"dev_matched")
def get_labels(self):
......@@ -387,198 +389,19 @@ class WnliProcessor(DataProcessor):
InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label))
return examples
glue_tasks_num_labels = {
"cola": 2,
"mnli": 3,
"mrpc": 2,
"sst-2": 2,
"sts-b": 1,
"qqp": 2,
"qnli": 2,
"rte": 2,
"wnli": 2,
}
def convert_examples_to_features(examples, label_list, max_seq_length,
tokenizer, output_mode,
cls_token_at_end=False,
cls_token='[CLS]',
cls_token_segment_id=1,
sep_token='[SEP]',
sep_token_extra=False,
pad_on_left=False,
pad_token=0,
pad_token_segment_id=0,
sequence_a_segment_id=0,
sequence_b_segment_id=1,
mask_padding_with_zero=True):
""" Loads a data file into a list of `InputBatch`s
`cls_token_at_end` define the location of the CLS token:
- False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
- True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
`cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
"""
label_map = {label : i for i, label in enumerate(label_list)}
features = []
for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
logger.info("Writing example %d of %d" % (ex_index, len(examples)))
tokens_a = tokenizer.tokenize(example.text_a)
tokens_b = None
if example.text_b:
tokens_b = tokenizer.tokenize(example.text_b)
# Modifies `tokens_a` and `tokens_b` in place so that the total
# length is less than the specified length.
# Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
special_tokens_count = 4 if sep_token_extra else 3
_truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
else:
# Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
special_tokens_count = 3 if sep_token_extra else 2
if len(tokens_a) > max_seq_length - special_tokens_count:
tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]
# The convention in BERT is:
# (a) For sequence pairs:
# tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
# type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1
# (b) For single sequences:
# tokens: [CLS] the dog is hairy . [SEP]
# type_ids: 0 0 0 0 0 0 0
#
# Where "type_ids" are used to indicate whether this is the first
# sequence or the second sequence. The embedding vectors for `type=0` and
# `type=1` were learned during pre-training and are added to the wordpiece
# embedding vector (and position vector). This is not *strictly* necessary
# since the [SEP] token unambiguously separates the sequences, but it makes
# it easier for the model to learn the concept of sequences.
#
# For classification tasks, the first vector (corresponding to [CLS]) is
# used as as the "sentence vector". Note that this only makes sense because
# the entire model is fine-tuned.
tokens = tokens_a + [sep_token]
if sep_token_extra:
# roberta uses an extra separator b/w pairs of sentences
tokens += [sep_token]
segment_ids = [sequence_a_segment_id] * len(tokens)
if tokens_b:
tokens += tokens_b + [sep_token]
segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
if cls_token_at_end:
tokens = tokens + [cls_token]
segment_ids = segment_ids + [cls_token_segment_id]
else:
tokens = [cls_token] + tokens
segment_ids = [cls_token_segment_id] + segment_ids
input_ids = tokenizer.convert_tokens_to_ids(tokens)
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
# Zero-pad up to the sequence length.
padding_length = max_seq_length - len(input_ids)
if pad_on_left:
input_ids = ([pad_token] * padding_length) + input_ids
input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
else:
input_ids = input_ids + ([pad_token] * padding_length)
input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)
assert len(input_ids) == max_seq_length
assert len(input_mask) == max_seq_length
assert len(segment_ids) == max_seq_length
if output_mode == "classification":
label_id = label_map[example.label]
elif output_mode == "regression":
label_id = float(example.label)
else:
raise KeyError(output_mode)
if ex_index < 5:
logger.info("*** Example ***")
logger.info("guid: %s" % (example.guid))
logger.info("tokens: %s" % " ".join(
[str(x) for x in tokens]))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
logger.info("label: %s (id = %d)" % (example.label, label_id))
features.append(
InputFeatures(input_ids=input_ids,
input_mask=input_mask,
segment_ids=segment_ids,
label_id=label_id))
return features
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
"""Truncates a sequence pair in place to the maximum length."""
# This is a simple heuristic which will always truncate the longer sequence
# one token at a time. This makes more sense than truncating an equal percent
# of tokens from each, since if one sequence is very short then each token
# that's truncated likely contains more information than a longer sequence.
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
def simple_accuracy(preds, labels):
return (preds == labels).mean()
def acc_and_f1(preds, labels):
acc = simple_accuracy(preds, labels)
f1 = f1_score(y_true=labels, y_pred=preds)
return {
"acc": acc,
"f1": f1,
"acc_and_f1": (acc + f1) / 2,
}
def pearson_and_spearman(preds, labels):
pearson_corr = pearsonr(preds, labels)[0]
spearman_corr = spearmanr(preds, labels)[0]
return {
"pearson": pearson_corr,
"spearmanr": spearman_corr,
"corr": (pearson_corr + spearman_corr) / 2,
}
def compute_metrics(task_name, preds, labels):
assert len(preds) == len(labels)
if task_name == "cola":
return {"mcc": matthews_corrcoef(labels, preds)}
elif task_name == "sst-2":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "mrpc":
return acc_and_f1(preds, labels)
elif task_name == "sts-b":
return pearson_and_spearman(preds, labels)
elif task_name == "qqp":
return acc_and_f1(preds, labels)
elif task_name == "mnli":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "mnli-mm":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "qnli":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "rte":
return {"acc": simple_accuracy(preds, labels)}
elif task_name == "wnli":
return {"acc": simple_accuracy(preds, labels)}
else:
raise KeyError(task_name)
processors = {
glue_processors = {
"cola": ColaProcessor,
"mnli": MnliProcessor,
"mnli-mm": MnliMismatchedProcessor,
......@@ -591,7 +414,7 @@ processors = {
"wnli": WnliProcessor,
}
output_modes = {
glue_output_modes = {
"cola": "classification",
"mnli": "classification",
"mnli-mm": "classification",
......@@ -603,15 +426,3 @@ output_modes = {
"rte": "classification",
"wnli": "classification",
}
GLUE_TASKS_NUM_LABELS = {
"cola": 2,
"mnli": 3,
"mrpc": 2,
"sst-2": 2,
"sts-b": 1,
"qqp": 2,
"qnli": 2,
"rte": 2,
"wnli": 2,
}
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
import sys
class InputExample(object):
"""A single training/test example for simple sequence classification."""
def __init__(self, guid, text_a, text_b=None, label=None):
"""Constructs a InputExample.
Args:
guid: Unique id for the example.
text_a: string. The untokenized text of the first sequence. For single
sequence tasks, only this sequence must be specified.
text_b: (Optional) string. The untokenized text of the second sequence.
Only must be specified for sequence pair tasks.
label: (Optional) string. The label of the example. This should be
specified for train and dev examples, but not for test examples.
"""
self.guid = guid
self.text_a = text_a
self.text_b = text_b
self.label = label
class InputFeatures(object):
"""A single set of features of data."""
def __init__(self, input_ids, input_mask, segment_ids, label_id):
self.input_ids = input_ids
self.input_mask = input_mask
self.segment_ids = segment_ids
self.label_id = label_id
class DataProcessor(object):
"""Base class for data converters for sequence classification data sets."""
def get_train_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the train set."""
raise NotImplementedError()
def get_dev_examples(self, data_dir):
"""Gets a collection of `InputExample`s for the dev set."""
raise NotImplementedError()
def get_labels(self):
"""Gets the list of labels for this data set."""
raise NotImplementedError()
@classmethod
def _read_tsv(cls, input_file, quotechar=None):
"""Reads a tab separated value file."""
with open(input_file, "r", encoding="utf-8-sig") as f:
reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
lines = []
for line in reader:
if sys.version_info[0] == 2:
line = list(unicode(cell, 'utf-8') for cell in line)
lines.append(line)
return lines
......@@ -533,7 +533,7 @@ BERT_INPUTS_DOCSTRING = r"""
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare Bert Model transformer outputing raw hidden-states without any specific head on top.",
@add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
class BertModel(BertPreTrainedModel):
r"""
......
......@@ -394,7 +394,7 @@ DISTILBERT_INPUTS_DOCSTRING = r"""
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare DistilBERT encoder/transformer outputing raw hidden-states without any specific head on top.",
@add_start_docstrings("The bare DistilBERT encoder/transformer outputting raw hidden-states without any specific head on top.",
DISTILBERT_START_DOCSTRING, DISTILBERT_INPUTS_DOCSTRING)
class DistilBertModel(DistilBertPreTrainedModel):
r"""
......
......@@ -290,7 +290,7 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
Indices of input sequence tokens in the vocabulary.
GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
the right rather than the left.
Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**past**:
......@@ -314,7 +314,7 @@ GPT2_INPUTS_DOCSTRING = r""" Inputs:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
@add_start_docstrings("The bare GPT2 Model transformer outputting raw hidden-states without any specific head on top.",
GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
class GPT2Model(GPT2PreTrainedModel):
r"""
......
......@@ -324,7 +324,7 @@ OPENAI_GPT_INPUTS_DOCSTRING = r""" Inputs:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
@add_start_docstrings("The bare OpenAI GPT transformer model outputting raw hidden-states without any specific head on top.",
OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
r"""
......
......@@ -124,7 +124,7 @@ ROBERTA_INPUTS_DOCSTRING = r"""
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
"""
@add_start_docstrings("The bare RoBERTa Model transformer outputing raw hidden-states without any specific head on top.",
@add_start_docstrings("The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
class RobertaModel(BertModel):
r"""
......@@ -296,7 +296,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
Examples::
tokenizer = RoertaTokenizer.from_pretrained('roberta-base')
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base')
input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
......@@ -338,6 +338,113 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
return outputs # (loss), logits, (hidden_states), (attentions)
@add_start_docstrings("""Roberta Model with a multiple choice classification head on top (a linear layer on top of
the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
class RobertaForMultipleChoice(BertPreTrainedModel):
r"""
Inputs:
**input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Indices of input sequence tokens in the vocabulary.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
To match pre-training, RoBerta input sequence should be formatted with [CLS] and [SEP] tokens as follows:
(a) For sequence pairs:
``tokens: [CLS] is this jack ##son ##ville ? [SEP] [SEP] no it is not . [SEP]``
``token_type_ids: 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1``
(b) For single sequences:
``tokens: [CLS] the dog is hairy . [SEP]``
``token_type_ids: 0 0 0 0 0 0 0``
Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
:func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
**token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Segment token indices to indicate first and second portions of the inputs.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
**attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
Mask to avoid performing attention on padding token indices.
The second dimension of the input (`num_choices`) indicates the number of choices to score.
Mask values selected in ``[0, 1]``:
``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
**head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
Mask to nullify selected heads of the self-attention modules.
Mask values selected in ``[0, 1]``:
``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
**labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
Labels for computing the multiple choice classification loss.
Indices should be in ``[0, ..., num_choices]`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above)
Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
**loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
Classification loss.
**classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension
of the input tensors. (see `input_ids` above).
Classification scores (before SoftMax).
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
of shape ``(batch_size, sequence_length, hidden_size)``:
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
**attentions**: (`optional`, returned when ``config.output_attentions=True``)
list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
Examples::
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMultipleChoice.from_pretrained('roberta-base')
choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices
labels = torch.tensor(1).unsqueeze(0) # Batch size 1
outputs = model(input_ids, labels=labels)
loss, classification_scores = outputs[:2]
"""
config_class = RobertaConfig
pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
base_model_prefix = "roberta"
def __init__(self, config):
super(RobertaForMultipleChoice, self).__init__(config)
self.roberta = RobertaModel(config)
self.dropout = nn.Dropout(config.hidden_dropout_prob)
self.classifier = nn.Linear(config.hidden_size, 1)
self.init_weights()
def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
position_ids=None, head_mask=None):
num_choices = input_ids.shape[1]
flat_input_ids = input_ids.view(-1, input_ids.size(-1))
flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
outputs = self.roberta(flat_input_ids, position_ids=flat_position_ids, token_type_ids=flat_token_type_ids,
attention_mask=flat_attention_mask, head_mask=head_mask)
pooled_output = outputs[1]
pooled_output = self.dropout(pooled_output)
logits = self.classifier(pooled_output)
reshaped_logits = logits.view(-1, num_choices)
outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
if labels is not None:
loss_fct = CrossEntropyLoss()
loss = loss_fct(reshaped_logits, labels)
outputs = (loss,) + outputs
return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
class RobertaClassificationHead(nn.Module):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment