"vscode:/vscode.git/clone" did not exist on "930c9412b4a5f558f03ff1622bf399c2cedea21a"
Unverified commit 403d3098, authored by Sylvain Gugger, committed by GitHub

Hans data (#4854)

* Update hans data to be able to use Trainer

* Fixes

* Deal with tokenizers that don't have token_ids

* Clean up things

* Simplify data use

* Fix the input dict

* Formatting + proper path in README
parent ca5e1cdf
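For orientation before the diff itself: the net effect of this commit is that HANS data flows through a map-style dataset whose features are collated into dict batches, the shape Trainer (and the updated train/evaluate loops below) expect. A minimal sketch, not part of the diff; the checkpoint name and data path are placeholders.

# Sketch only; assumes a local HANS checkout at /path/to/hans and a BERT checkpoint.
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, DefaultDataCollator
from utils_hans import HansDataset

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
eval_dataset = HansDataset("/path/to/hans", tokenizer, task="hans", max_seq_length=128, evaluate=True)
loader = DataLoader(eval_dataset, batch_size=8, collate_fn=DefaultDataCollator().collate_batch)
batch = next(iter(loader))
# batch is a dict: input_ids, attention_mask, token_type_ids, labels, pairID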
README.md
@@ -11,7 +11,7 @@
 export HANS_DIR=path-to-hans
 export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc
 export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py
-python examples/hans/test_hans.py \
+python examples/adversarial/test_hans.py \
   --task_name hans \
   --model_type $MODEL_TYPE \
   --do_eval \
hans_processors.py (removed by this commit; its processor and feature-conversion code are reworked into utils_hans.py below)

# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" GLUE processors and helpers """
import logging
import os
from transformers.file_utils import is_tf_available
from utils_hans import DataProcessor, InputExample, InputFeatures
if is_tf_available():
import tensorflow as tf
logger = logging.getLogger(__name__)
def hans_convert_examples_to_features(
examples,
tokenizer,
max_length=512,
task=None,
label_list=None,
output_mode=None,
pad_on_left=False,
pad_token=0,
pad_token_segment_id=0,
mask_padding_with_zero=True,
):
"""
Loads a data file into a list of ``InputFeatures``
Args:
examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
tokenizer: Instance of a tokenizer that will tokenize the examples
max_length: Maximum example length
task: HANS
label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
output_mode: String indicating the output mode. Either ``regression`` or ``classification``
pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
pad_token: Padding token
pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
actual values)
Returns:
If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
containing the task-specific features. If the input is a list of ``InputExamples``, will return
a list of task-specific ``InputFeatures`` which can be fed to the model.
"""
is_tf_dataset = False
if is_tf_available() and isinstance(examples, tf.data.Dataset):
is_tf_dataset = True
if task is not None:
processor = glue_processors[task]()
if label_list is None:
label_list = processor.get_labels()
logger.info("Using label list %s for task %s" % (label_list, task))
if output_mode is None:
output_mode = glue_output_modes[task]
logger.info("Using output mode %s for task %s" % (output_mode, task))
label_map = {label: i for i, label in enumerate(label_list)}
features = []
for (ex_index, example) in enumerate(examples):
if ex_index % 10000 == 0:
logger.info("Writing example %d" % (ex_index))
if is_tf_dataset:
example = processor.get_example_from_tensor_dict(example)
example = processor.tfds_map(example)
inputs = tokenizer.encode_plus(example.text_a, example.text_b, add_special_tokens=True, max_length=max_length,)
input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
# The mask has 1 for real tokens and 0 for padding tokens. Only real
# tokens are attended to.
attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
# Zero-pad up to the sequence length.
padding_length = max_length - len(input_ids)
if pad_on_left:
input_ids = ([pad_token] * padding_length) + input_ids
attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
else:
input_ids = input_ids + ([pad_token] * padding_length)
attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
len(attention_mask), max_length
)
assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
len(token_type_ids), max_length
)
if output_mode == "classification":
label = label_map[example.label] if example.label in label_map else 0
elif output_mode == "regression":
label = float(example.label)
else:
raise KeyError(output_mode)
pairID = str(example.pairID)
if ex_index < 10:
logger.info("*** Example ***")
logger.info("text_a: %s" % (example.text_a))
logger.info("text_b: %s" % (example.text_b))
logger.info("guid: %s" % (example.guid))
logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
logger.info("label: %s (id = %d)" % (example.label, label))
features.append(
InputFeatures(
input_ids=input_ids,
attention_mask=attention_mask,
token_type_ids=token_type_ids,
label=label,
pairID=pairID,
)
)
if is_tf_available() and is_tf_dataset:
def gen():
for ex in features:
yield (
{
"input_ids": ex.input_ids,
"attention_mask": ex.attention_mask,
"token_type_ids": ex.token_type_ids,
},
ex.label,
)
return tf.data.Dataset.from_generator(
gen,
({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
(
{
"input_ids": tf.TensorShape([None]),
"attention_mask": tf.TensorShape([None]),
"token_type_ids": tf.TensorShape([None]),
},
tf.TensorShape([]),
),
)
return features
class HansProcessor(DataProcessor):
"""Processor for the HANS data set."""
def get_example_from_tensor_dict(self, tensor_dict):
"""See base class."""
return InputExample(
tensor_dict["idx"].numpy(),
tensor_dict["premise"].numpy().decode("utf-8"),
tensor_dict["hypothesis"].numpy().decode("utf-8"),
str(tensor_dict["label"].numpy()),
)
def get_train_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_train_set.txt")), "train")
def get_dev_examples(self, data_dir):
"""See base class."""
return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_evaluation_set.txt")), "dev")
def get_labels(self):
"""See base class."""
return ["contradiction", "entailment", "neutral"]
def _create_examples(self, lines, set_type):
"""Creates examples for the training and dev sets."""
examples = []
for (i, line) in enumerate(lines):
if i == 0:
continue
guid = "%s-%s" % (set_type, line[0])
text_a = line[5]
text_b = line[6]
pairID = line[7][2:] if line[7].startswith("ex") else line[7]
label = line[-1]
examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, pairID=pairID))
return examples
glue_tasks_num_labels = {
"hans": 3,
}
glue_processors = {
"hans": HansProcessor,
}
glue_output_modes = {
"hans": "classification",
}
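For reference, a condensed sketch of how this (now removed) API was driven; it mirrors the load_and_cache_examples function in test_hans.py further down. The data path and checkpoint name are placeholders.

# Old pipeline sketch: examples -> fixed-length features, padded manually.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
processor = HansProcessor()
examples = processor.get_dev_examples("/path/to/hans")
features = hans_convert_examples_to_features(
    examples,
    tokenizer,
    max_length=128,
    label_list=processor.get_labels(),
    output_mode="classification",
    pad_token=tokenizer.pad_token_id,
)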
examples/adversarial/test_hans.py
@@ -25,13 +25,10 @@ import random
 import numpy as np
 import torch
-from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange

-from hans_processors import glue_output_modes as output_modes
-from hans_processors import glue_processors as processors
-from hans_processors import hans_convert_examples_to_features as convert_examples_to_features
 from transformers import (
     WEIGHTS_NAME,
     AdamW,
@@ -41,6 +38,7 @@ from transformers import (
     BertConfig,
     BertForSequenceClassification,
     BertTokenizer,
+    DefaultDataCollator,
     DistilBertConfig,
     DistilBertForSequenceClassification,
     DistilBertTokenizer,
@@ -55,6 +53,7 @@ from transformers import (
     XLNetTokenizer,
     get_linear_schedule_with_warmup,
 )
+from utils_hans import HansDataset, hans_output_modes, hans_processors

 try:
@@ -91,7 +90,12 @@ def train(args, train_dataset, model, tokenizer):
     args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
     train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
-    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+    train_dataloader = DataLoader(
+        train_dataset,
+        sampler=train_sampler,
+        batch_size=args.train_batch_size,
+        collate_fn=DefaultDataCollator().collate_batch,
+    )

     if args.max_steps > 0:
         t_total = args.max_steps
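DefaultDataCollator().collate_batch is what makes the dict-based loops below work: it stacks each InputFeatures field into a tensor keyed by the field name, with the label field surfaced as "labels". A rough approximation for illustration only, not the library's implementation:

import torch

def collate(features):
    # Stack each present field across the batch into one long tensor.
    batch = {}
    for k in ("input_ids", "attention_mask", "token_type_ids", "pairID"):
        if getattr(features[0], k, None) is not None:
            batch[k] = torch.tensor([getattr(f, k) for f in features], dtype=torch.long)
    # The label field is renamed to "labels", the keyword the models expect.
    batch["labels"] = torch.tensor([f.label for f in features], dtype=torch.long)
    return batch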
@@ -153,12 +157,7 @@ def train(args, train_dataset, model, tokenizer):
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
             model.train()
-            batch = tuple(t.to(args.device) for t in batch)
-            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-            if args.model_type != "distilbert":
-                inputs["token_type_ids"] = (
-                    batch[2] if args.model_type in ["bert", "xlnet"] else None
-                )  # XLM, DistilBERT and RoBERTa don't use segment_ids
+            inputs = {k: t.to(args.device) for k, t in batch.items() if k != "pairID"}
             outputs = model(**inputs)
             loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
@@ -230,14 +229,21 @@ def train(args, train_dataset, model, tokenizer):
     return global_step, tr_loss / global_step


-def evaluate(args, model, tokenizer, prefix=""):
+def evaluate(args, model, tokenizer, label_list, prefix=""):
     # Loop to handle MNLI double evaluation (matched, mis-matched)
     eval_task_names = ("mnli", "mnli-mm") if args.task_name == "mnli" else (args.task_name,)
     eval_outputs_dirs = (args.output_dir, args.output_dir + "-MM") if args.task_name == "mnli" else (args.output_dir,)

     results = {}
     for eval_task, eval_output_dir in zip(eval_task_names, eval_outputs_dirs):
-        eval_dataset, label_list = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
+        eval_dataset = HansDataset(
+            args.data_dir,
+            tokenizer,
+            args.task_name,
+            args.max_seq_length,
+            overwrite_cache=args.overwrite_cache,
+            evaluate=True,
+        )

         if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
             os.makedirs(eval_output_dir)
@@ -245,7 +251,12 @@ def evaluate(args, model, tokenizer, prefix=""):
         args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
         # Note that DistributedSampler samples randomly
         eval_sampler = SequentialSampler(eval_dataset)
-        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+        eval_dataloader = DataLoader(
+            eval_dataset,
+            sampler=eval_sampler,
+            batch_size=args.eval_batch_size,
+            collate_fn=DefaultDataCollator().collate_batch,
+        )

         # multi-gpu eval
         if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
@@ -261,14 +272,9 @@ def evaluate(args, model, tokenizer, prefix=""):
         out_label_ids = None
         for batch in tqdm(eval_dataloader, desc="Evaluating"):
             model.eval()
-            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {k: t.to(args.device) for k, t in batch.items() if k != "pairID"}
+            # Keep the per-batch pairID tensor under its own name so it does not
+            # clobber the running numpy array accumulated below.
+            batch_pair_ids = batch.pop("pairID", None)

             with torch.no_grad():
-                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
-                if args.model_type != "distilbert":
-                    inputs["token_type_ids"] = (
-                        batch[2] if args.model_type in ["bert", "xlnet"] else None
-                    )  # XLM, DistilBERT and RoBERTa don't use segment_ids
                 outputs = model(**inputs)
                 tmp_eval_loss, logits = outputs[:2]
@@ -277,11 +283,11 @@ def evaluate(args, model, tokenizer, prefix=""):
             if preds is None:
                 preds = logits.detach().cpu().numpy()
                 out_label_ids = inputs["labels"].detach().cpu().numpy()
-                pair_ids = batch[4].detach().cpu().numpy()
+                pair_ids = batch_pair_ids.detach().cpu().numpy()
             else:
                 preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                 out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
-                pair_ids = np.append(pair_ids, batch[4].detach().cpu().numpy(), axis=0)
+                pair_ids = np.append(pair_ids, batch_pair_ids.detach().cpu().numpy(), axis=0)
         eval_loss = eval_loss / nb_eval_steps
         if args.output_mode == "classification":
@@ -298,67 +304,6 @@ def evaluate(args, model, tokenizer, prefix=""):
     return results


-def load_and_cache_examples(args, task, tokenizer, evaluate=False):
-    if args.local_rank not in [-1, 0] and not evaluate:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-    processor = processors[task]()
-    output_mode = output_modes[task]
-    # Load data features from cache or dataset file
-    cached_features_file = os.path.join(
-        args.data_dir,
-        "cached_{}_{}_{}_{}".format(
-            "dev" if evaluate else "train",
-            list(filter(None, args.model_name_or_path.split("/"))).pop(),
-            str(args.max_seq_length),
-            str(task),
-        ),
-    )
-    label_list = processor.get_labels()
-    if os.path.exists(cached_features_file) and not args.overwrite_cache:
-        logger.info("Loading features from cached file %s", cached_features_file)
-        features = torch.load(cached_features_file)
-    else:
-        logger.info("Creating features from dataset file at %s", args.data_dir)
-        if task in ["mnli", "mnli-mm"] and args.model_type in ["roberta"]:
-            # HACK(label indices are swapped in RoBERTa pretrained model)
-            label_list[1], label_list[2] = label_list[2], label_list[1]
-        examples = (
-            processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
-        )
-        features = convert_examples_to_features(
-            examples,
-            tokenizer,
-            label_list=label_list,
-            max_length=args.max_seq_length,
-            output_mode=output_mode,
-            pad_on_left=bool(args.model_type in ["xlnet"]),  # pad on the left for xlnet
-            pad_token=tokenizer.pad_token_id,
-            pad_token_segment_id=tokenizer.pad_token_type_id,
-        )
-        if args.local_rank in [-1, 0]:
-            logger.info("Saving features into cached file %s", cached_features_file)
-            torch.save(features, cached_features_file)
-
-    if args.local_rank == 0 and not evaluate:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
-
-    # Convert to Tensors and build dataset
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
-    if output_mode == "classification":
-        all_labels = torch.tensor([f.label for f in features], dtype=torch.long)
-    elif output_mode == "regression":
-        all_labels = torch.tensor([f.label for f in features], dtype=torch.float)
-    all_pair_ids = torch.tensor([int(f.pairID) for f in features], dtype=torch.long)
-
-    dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids, all_labels, all_pair_ids)
-    return dataset, label_list
 def main():
     parser = argparse.ArgumentParser()
@@ -389,7 +334,7 @@ def main():
         default=None,
         type=str,
         required=True,
-        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()),
+        help="The name of the task to train selected in the list: " + ", ".join(hans_processors.keys()),
     )
     parser.add_argument(
         "--output_dir",
@@ -541,10 +486,10 @@ def main():
     # Prepare GLUE task
     args.task_name = args.task_name.lower()
-    if args.task_name not in processors:
+    if args.task_name not in hans_processors:
         raise ValueError("Task not found: %s" % (args.task_name))
-    processor = processors[args.task_name]()
-    args.output_mode = output_modes[args.task_name]
+    processor = hans_processors[args.task_name]()
+    args.output_mode = hans_output_modes[args.task_name]
     label_list = processor.get_labels()
     num_labels = len(label_list)
@@ -581,7 +526,9 @@ def main():
     # Training
     if args.do_train:
-        train_dataset, _ = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
+        train_dataset = HansDataset(
+            args.data_dir, tokenizer, args.task_name, args.max_seq_length, overwrite_cache=args.overwrite_cache
+        )
         global_step, tr_loss = train(args, train_dataset, model, tokenizer)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
@@ -625,7 +572,7 @@ def main():
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=prefix)
+            result = evaluate(args, model, tokenizer, label_list, prefix=prefix)
             result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
             results.update(result)
examples/adversarial/utils_hans.py
@@ -14,12 +14,30 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import copy
-import csv
-import json
+import logging
+import os
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import tqdm
+from filelock import FileLock
+
+from transformers import (
+    DataProcessor,
+    PreTrainedTokenizer,
+    RobertaTokenizer,
+    RobertaTokenizerFast,
+    XLMRobertaTokenizer,
+    is_tf_available,
+    is_torch_available,
+)
+
+logger = logging.getLogger(__name__)


-class InputExample(object):
+@dataclass(frozen=True)
+class InputExample:
     """
     A single training/test example for simple sequence classification.
@@ -31,91 +49,304 @@ class InputExample:
             Only must be specified for sequence pair tasks.
         label: (Optional) string. The label of the example. This should be
             specified for train and dev examples, but not for test examples.
+        pairID: (Optional) string. Unique identifier for the pair of sentences.
     """

-    def __init__(self, guid, text_a, text_b=None, label=None, pairID=None):
-        self.guid = guid
-        self.text_a = text_a
-        self.text_b = text_b
-        self.label = label
-        self.pairID = pairID
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
-
-    def to_json_string(self):
-        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+    guid: str
+    text_a: str
+    text_b: Optional[str] = None
+    label: Optional[str] = None
+    pairID: Optional[str] = None
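Both classes are now frozen dataclasses, so instances are immutable and keyword-constructed. Note also that HANS's own label strings include "non-entailment", which is not in the MNLI-style label list returned by get_labels(); hans_convert_examples_to_features (further below) therefore falls back to label id 0 for such examples. An illustrative instance with made-up text:

# Made-up example; all field values are placeholders.
example = InputExample(
    guid="dev-1",
    text_a="The doctor saw the lawyer.",
    text_b="The lawyer saw the doctor.",
    label="non-entailment",  # not in ["contradiction", "entailment", "neutral"] -> maps to id 0
    pairID="42",
)
# example.label = "entailment"  # would raise FrozenInstanceError: the dataclass is frozen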
-class InputFeatures(object):
+@dataclass(frozen=True)
+class InputFeatures:
     """
     A single set of features of data.
+    Property names are the same names as the corresponding inputs to a model.

     Args:
         input_ids: Indices of input sequence tokens in the vocabulary.
         attention_mask: Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
             Usually ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
-        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
-        label: Label corresponding to the input
+        token_type_ids: (Optional) Segment token indices to indicate first and second
+            portions of the inputs. Only some models use them.
+        label: (Optional) Label corresponding to the input. Int for classification problems,
+            float for regression problems.
+        pairID: (Optional) Unique identifier for the pair of sentences.
     """

-    def __init__(self, input_ids, attention_mask, token_type_ids, label, pairID=None):
-        self.input_ids = input_ids
-        self.attention_mask = attention_mask
-        self.token_type_ids = token_type_ids
-        self.label = label
-        self.pairID = pairID
+    input_ids: List[int]
+    attention_mask: Optional[List[int]] = None
+    token_type_ids: Optional[List[int]] = None
+    label: Optional[Union[int, float]] = None
+    pairID: Optional[int] = None
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
-
-    def to_json_string(self):
-        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-
-
-class DataProcessor(object):
-    """Base class for data converters for sequence classification data sets."""
-
-    def get_example_from_tensor_dict(self, tensor_dict):
-        """Gets an example from a dict with tensorflow tensors
-
-        Args:
-            tensor_dict: Keys and values should match the corresponding Glue
-                tensorflow_dataset examples.
-        """
-        raise NotImplementedError()
-
-    def get_train_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the train set."""
-        raise NotImplementedError()
-
-    def get_dev_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the dev set."""
-        raise NotImplementedError()
-
-    def get_labels(self):
-        """Gets the list of labels for this data set."""
-        raise NotImplementedError()
-
-    @classmethod
-    def _read_tsv(cls, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with open(input_file, "r", encoding="utf-8-sig") as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
-            lines = []
-            for line in reader:
-                lines.append(line)
-            return lines
+
+
+if is_torch_available():
+    import torch
+    from torch.utils.data.dataset import Dataset
+
+    class HansDataset(Dataset):
+        """
+        This will be superseded by a framework-agnostic approach
+        soon.
+        """
+
+        features: List[InputFeatures]
+
+        def __init__(
+            self,
+            data_dir: str,
+            tokenizer: PreTrainedTokenizer,
+            task: str,
+            max_seq_length: Optional[int] = None,
+            overwrite_cache=False,
+            evaluate: bool = False,
+        ):
+            processor = hans_processors[task]()
+            output_mode = hans_output_modes[task]
+
+            cached_features_file = os.path.join(
+                data_dir,
+                "cached_{}_{}_{}_{}".format(
+                    "dev" if evaluate else "train", tokenizer.__class__.__name__, str(max_seq_length), task,
+                ),
+            )
+
+            # Make sure only the first process in distributed training processes the dataset,
+            # and the others will use the cache.
+            lock_path = cached_features_file + ".lock"
+            with FileLock(lock_path):
+                if os.path.exists(cached_features_file) and not overwrite_cache:
+                    logger.info(f"Loading features from cached file {cached_features_file}")
+                    self.features = torch.load(cached_features_file)
+                else:
+                    logger.info(f"Creating features from dataset file at {data_dir}")
+                    label_list = processor.get_labels()
+                    if task in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
+                        RobertaTokenizer,
+                        RobertaTokenizerFast,
+                        XLMRobertaTokenizer,
+                    ):
+                        # HACK(label indices are swapped in RoBERTa pretrained model)
+                        label_list[1], label_list[2] = label_list[2], label_list[1]
+                    examples = (
+                        processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir)
+                    )
+
+                    logger.info("Training examples: %s", len(examples))
+                    # TODO clean up all this to leverage built-in features of tokenizers
+                    self.features = hans_convert_examples_to_features(
+                        examples, label_list, max_seq_length, tokenizer, output_mode
+                    )
+                    logger.info("Saving features into cached file %s", cached_features_file)
+                    torch.save(self.features, cached_features_file)
+
+        def __len__(self):
+            return len(self.features)
+
+        def __getitem__(self, i) -> InputFeatures:
+            return self.features[i]
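HansDataset is a plain map-style dataset: indexing returns an InputFeatures, and the FileLock above makes the on-disk cache (cached_{dev|train}_{TokenizerClass}_{seqlen}_{task}, written next to the data) safe to build exactly once across distributed workers. A quick usage sketch, with a placeholder path:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
dataset = HansDataset("/path/to/hans", tokenizer, task="hans", max_seq_length=128, evaluate=True)
print(len(dataset), dataset[0].pairID)  # a second run loads the features from the cache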
+
+if is_tf_available():
+    import tensorflow as tf
+
+    class TFHansDataset:
+        """
+        This will be superseded by a framework-agnostic approach
+        soon.
+        """
+
+        features: List[InputFeatures]
+
+        def __init__(
+            self,
+            data_dir: str,
+            tokenizer: PreTrainedTokenizer,
+            task: str,
+            max_seq_length: Optional[int] = 128,
+            overwrite_cache=False,
+            evaluate: bool = False,
+        ):
+            processor = hans_processors[task]()
+            output_mode = hans_output_modes[task]
+
+            label_list = processor.get_labels()
+            if task in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
+                RobertaTokenizer,
+                RobertaTokenizerFast,
+                XLMRobertaTokenizer,
+            ):
+                # HACK(label indices are swapped in RoBERTa pretrained model)
+                label_list[1], label_list[2] = label_list[2], label_list[1]
+
+            examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir)
+            self.features = hans_convert_examples_to_features(
+                examples, label_list, max_seq_length, tokenizer, output_mode
+            )
+
+            def gen():
+                for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
+                    if ex_index % 10000 == 0:
+                        logger.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+                    yield (
+                        {
+                            "example_id": 0,
+                            "input_ids": ex.input_ids,
+                            "attention_mask": ex.attention_mask,
+                            "token_type_ids": ex.token_type_ids,
+                        },
+                        ex.label,
+                    )
+
+            self.dataset = tf.data.Dataset.from_generator(
+                gen,
+                (
+                    {
+                        "example_id": tf.int32,
+                        "input_ids": tf.int32,
+                        "attention_mask": tf.int32,
+                        "token_type_ids": tf.int32,
+                    },
+                    tf.int64,
+                ),
+                (
+                    {
+                        "example_id": tf.TensorShape([]),
+                        "input_ids": tf.TensorShape([None, None]),
+                        "attention_mask": tf.TensorShape([None, None]),
+                        "token_type_ids": tf.TensorShape([None, None]),
+                    },
+                    tf.TensorShape([]),
+                ),
+            )
+
+        def get_dataset(self):
+            return self.dataset
+
+        def __len__(self):
+            return len(self.features)
+
+        def __getitem__(self, i) -> InputFeatures:
+            return self.features[i]
+
+class HansProcessor(DataProcessor):
+    """Processor for the HANS data set."""
+
+    def get_example_from_tensor_dict(self, tensor_dict):
+        """See base class."""
+        return InputExample(
+            tensor_dict["idx"].numpy(),
+            tensor_dict["premise"].numpy().decode("utf-8"),
+            tensor_dict["hypothesis"].numpy().decode("utf-8"),
+            str(tensor_dict["label"].numpy()),
+        )
+
+    def get_train_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_train_set.txt")), "train")
+
+    def get_dev_examples(self, data_dir):
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_evaluation_set.txt")), "dev")
+
+    def get_labels(self):
+        """See base class."""
+        return ["contradiction", "entailment", "neutral"]
+
+    def _create_examples(self, lines, set_type):
+        """Creates examples for the training and dev sets."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:
+                continue
+            guid = "%s-%s" % (set_type, line[0])
+            text_a = line[5]
+            text_b = line[6]
+            pairID = line[7][2:] if line[7].startswith("ex") else line[7]
+            label = line[-1]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, pairID=pairID))
+        return examples
+
+def hans_convert_examples_to_features(
+    examples: List[InputExample],
+    label_list: List[str],
+    max_length: int,
+    tokenizer: PreTrainedTokenizer,
+    output_mode: str,
+):
+    """
+    Loads a data file into a list of ``InputFeatures``
+
+    Args:
+        examples: List of ``InputExamples`` containing the examples.
+        tokenizer: Instance of a tokenizer that will tokenize the examples.
+        max_length: Maximum example length.
+        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method.
+        output_mode: String indicating the output mode. Either ``regression`` or ``classification``.
+
+    Returns:
+        A list of task-specific ``InputFeatures`` which can be fed to the model.
+    """
+    label_map = {label: i for i, label in enumerate(label_list)}
+
+    features = []
+    for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
+        if ex_index % 10000 == 0:
+            logger.info("Writing example %d" % (ex_index))
+
+        inputs = tokenizer.encode_plus(
+            example.text_a,
+            example.text_b,
+            add_special_tokens=True,
+            max_length=max_length,
+            pad_to_max_length=True,
+            return_overflowing_tokens=True,
+        )
+        if "num_truncated_tokens" in inputs and inputs["num_truncated_tokens"] > 0:
+            logger.info(
+                "Attention! you are cropping tokens (swag task is ok). "
+                "If you are training ARC and RACE and you are popping question + options, "
+                "you need to try to use a bigger max seq length!"
+            )
+
+        if output_mode == "classification":
+            label = label_map[example.label] if example.label in label_map else 0
+        elif output_mode == "regression":
+            label = float(example.label)
+        else:
+            raise KeyError(output_mode)
+
+        pairID = int(example.pairID)
+
+        features.append(InputFeatures(**inputs, label=label, pairID=pairID))
+
+    for i, example in enumerate(examples[:5]):
+        logger.info("*** Example ***")
+        logger.info(f"guid: {example}")
+        logger.info(f"features: {features[i]}")
+
+    return features
+
+
+hans_tasks_num_labels = {
+    "hans": 3,
+}
+
+hans_processors = {
+    "hans": HansProcessor,
+}
+
+hans_output_modes = {
+    "hans": "classification",
+}